From 4b3c96595e15b044b528399021481bd9ac400c40 Mon Sep 17 00:00:00 2001
From: Edouard DUPIN
Date: Mon, 1 Apr 2019 22:21:19 +0200
Subject: [PATCH] [DEV] update normalisation preprocessing

---
 .../audio-reco-corpus/preprocessCorpus.py | 131 +++++++++++++++++-
 1 file changed, 125 insertions(+), 6 deletions(-)

diff --git a/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py b/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
index 9ee89b0..85a1cec 100755
--- a/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
+++ b/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
@@ -14,7 +14,10 @@ import math
 import json
 import resampy
 import numpy as np
-import scipy.io.wavfile as wavfile
+import scipy.io.wavfile
+import scipy.signal
+
+
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-i", "--input", help="Input directory",
@@ -81,21 +84,137 @@ for elem in audio_corpus_element:
 		debug.error(" ==> missing field 'audio_filename' ...")
 	filename = os.path.join(os.path.dirname(elem), input_data["audio_filename"])
 	if filename[-3:] == "wav":
-		sample_rate, audio_data = wavfile.read(filename)
+		sample_rate, audio_data = scipy.io.wavfile.read(filename)
 	else:
 		debug.error("Not supported file type: '" + str(filename[-3:]) + "' suported: [wav]")
 	debug.info("Read: " + str(len(audio_data)) + " sample(s)")
-	audio_16k = resampy.resample(audio_data, 48000, 16000)
+	
+	#######################################################
+	## Step 1: Generate speech areas (work on the raw input, which is guaranteed not to be modified yet)
+	#######################################################
+	audio_data_absolute = np.absolute(audio_data)
+	shape = np.copy(audio_data_absolute)
+	# average level of the ambient noise
+	avg = 0
+	# count is the number of samples used to calibrate the generic ambient noise
+	count = 0
+	count_all = 0
+	set_value = 32767
+	# number of samples used for the ambient noise calculation
+	ambiant_basic_size = 48000/4 # 1/4 s ==> the corpus is arranged to have 1 second of silence before the real voice data starts
+	for sample_audio in audio_data_absolute:
+		#shape[count_all] = 0
+		count_all += 1
+		if count < ambiant_basic_size:
+			# Remove the 0 padding
+			if count == 0 and sample_audio == 0:
+				continue
+			elif count == 0:
+				debug.info("data starts at sample " + str(count_all) + " (first non-zero value)")
+			count += 1
+			avg += int(sample_audio) # cast to a python int to avoid int16 overflow while accumulating
+		elif count == ambiant_basic_size:
+			count += 1
+			avg /= ambiant_basic_size
+			debug.info("basic ambient level is: " + str(avg) + ", start analysis at: " + str(count_all/48000) + " sec")
+			if avg <= 327.67:
+				avg = int(327.67)
+			else:
+				avg = int(avg * 1.1)
+			## set_value = avg
+			debug.info("    inspect at " + str(avg))
+		else:
+			if sample_audio >= avg:
+				shape[count_all-1] = set_value
+	
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_ori.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, audio_data)
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_abs.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, audio_data_absolute)
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape1.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, shape)
+	
+	windows = int(48000 * 0.02)
+	count_all = 0
+	count = 0
+	for sample_audio in shape:
+		count_all += 1
+		if sample_audio == set_value:
+			count = windows
+			continue
+		count -= 1
+		if count >= 0:
+			shape[count_all-1] = set_value
+	shape = np.flip(shape)
+	count_all = 0
+	count = 0
+	for sample_audio in shape:
+		count_all += 1
+		if sample_audio == set_value:
+			count = windows
+			continue
+		count -= 1
+		if count >= 0:
+			shape[count_all-1] = set_value
+		else:
+			shape[count_all-1] = 0
+	shape = np.flip(shape)
+	
+	filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape.wav")
+	scipy.io.wavfile.write(filename_out, 48000, shape)
+	
+	count_all = 0
+	table_voice_detected = [[0,False]]
+	previous = False
+	# use numpy slicing: [start:stop:step] https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#basic-slicing-and-indexing
+	for sample_audio in shape[3:len(shape):3]:
+		count_all += 1
+		if previous == False:
+			if sample_audio == set_value:
+				table_voice_detected.append([count_all, True])
+				previous = True
+		else:
+			if sample_audio == 0:
+				table_voice_detected.append([count_all, False])
+				previous = False
+	
+	#######################################################
+	## Step 2: Resample
+	#######################################################
+	audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_best')
+	#audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_fast')
 	debug.info("write: " + str(len(audio_16k)) + " sample(s)")
 	filename_out = os.path.join(args.output, input_data["audio_filename"])
-	wavfile.write(filename_out, 16000, audio_16k)
-	input_data["audio_sample_rate"] = 16000
+	scipy.io.wavfile.write(filename_out, 16000, audio_16k)
+	# create the new data format:
+	output_data = {
+		"value": input_data["value"],
+		"language": input_data["language"],
+		"audio_sample_rate": 16000,
+		"audio_filename": input_data["audio_filename"],
+		"VAD": table_voice_detected,
+		"action": [
+			{
+				"type": "resampling",
+				"tool": "resampy",
+				"desc": "48000 ==> 16000",
+				"src": elem,
+			},{
+				"type": "auto VAD",
+				"tool": "internal_1",
+				"desc": "With controlled input data and low background noise, voice can be detected from the signal power alone ...",
+				"src": elem,
+			},
+		],
+	}
+	
+	filename_out_json = os.path.join(args.output, os.path.basename(elem))
 	with open(filename_out_json, 'w') as outfile:
-		json.dump(input_data, outfile, indent="\t")
+		json.dump(output_data, outfile, indent="\t")
 debug.info("Finish")
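
The two passes over `shape` (forward, then again on the flipped signal) extend every above-threshold sample by `windows` samples in each direction, i.e. a symmetric 20 ms dilation of the voice mask. The patch also adds `import scipy.signal` without using it yet. Purely as an illustration of the same idea, and not part of the patch, a vectorized equivalent of those two passes could look roughly like the sketch below; the function name, the 20 ms hangover value and the boolean-mask output are assumptions taken from the loops above, not something the patch defines.

    # Sketch only, not part of the patch: a vectorized equivalent of the
    # forward + backward 20 ms "hangover" passes applied to the 48 kHz signal.
    import numpy as np

    def dilate_voice_mask(audio_data, threshold, sample_rate=48000, hangover_s=0.02):
        # raw above-threshold mask, same criterion as the per-sample loop
        mask = np.absolute(audio_data) >= threshold
        # extend every detection by `window` samples on each side
        window = int(sample_rate * hangover_s)
        kernel = np.ones(2 * window + 1)
        # np.convolve is exact here (0/1 values); scipy.signal.fftconvolve could
        # be substituted for speed on long recordings
        return np.convolve(mask.astype(float), kernel, mode="same") > 0

The result is a boolean mask rather than the 32767/0 `shape` signal used in the patch, so it would have to be scaled back to int16 before being written out as a debug wav.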
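The emitted JSON now carries a "VAD" table of [index, is_voice] transitions. Since the detection loop walks the 48 kHz `shape` signal with a step of 3, each index appears to correspond to one sample of the resampled 16 kHz output, so time in seconds would be index / 16000. A minimal consumer sketch under that assumption follows; the "corpus_element.json" path is a hypothetical placeholder for one of the JSON files written by the script.

    # Sketch only: read the "VAD" transition table written by preprocessCorpus.py
    # and print it as time ranges, assuming one VAD index == one sample at 16 kHz.
    import json

    SAMPLE_RATE_OUT = 16000  # rate of the resampled audio written next to the JSON

    with open("corpus_element.json") as f:  # hypothetical output file of the script
        data = json.load(f)

    vad = data["VAD"]  # [[index, is_voice], ...], starting with [0, False]
    # pair consecutive transitions to get closed segments; the state after the
    # last transition simply runs to the end of the recording
    for (index, is_voice), (next_index, _) in zip(vad, vad[1:]):
        label = "voice" if is_voice else "silence"
        print("%8.3f s -> %8.3f s : %s" % (index / SAMPLE_RATE_OUT, next_index / SAMPLE_RATE_OUT, label))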