[DEV] update normalisation preoprorcess
This commit is contained in:
parent
93912b54a3
commit
4b3c96595e
@ -14,7 +14,10 @@ import math
|
|||||||
import json
|
import json
|
||||||
import resampy
|
import resampy
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import scipy.io.wavfile as wavfile
|
import scipy.io.wavfile
|
||||||
|
import scipy.signal
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("-i", "--input", help="Input directory",
|
parser.add_argument("-i", "--input", help="Input directory",
|
||||||
@ -81,21 +84,137 @@ for elem in audio_corpus_element:
|
|||||||
debug.error(" ==> missing field 'audio_filename' ...")
|
debug.error(" ==> missing field 'audio_filename' ...")
|
||||||
filename = os.path.join(os.path.dirname(elem), input_data["audio_filename"])
|
filename = os.path.join(os.path.dirname(elem), input_data["audio_filename"])
|
||||||
if filename[-3:] == "wav":
|
if filename[-3:] == "wav":
|
||||||
sample_rate, audio_data = wavfile.read(filename)
|
sample_rate, audio_data = scipy.io.wavfile.read(filename)
|
||||||
else:
|
else:
|
||||||
debug.error("Not supported file type: '" + str(filename[-3:]) + "' suported: [wav]")
|
debug.error("Not supported file type: '" + str(filename[-3:]) + "' suported: [wav]")
|
||||||
debug.info("Read: " + str(len(audio_data)) + " sample(s)")
|
debug.info("Read: " + str(len(audio_data)) + " sample(s)")
|
||||||
|
|
||||||
audio_16k = resampy.resample(audio_data, 48000, 16000)
|
|
||||||
|
#######################################################
|
||||||
|
## Step 1: Generate speech areas (work on input, because I am sure that the data is not modified
|
||||||
|
#######################################################
|
||||||
|
audio_data_absolute = np.absolute(audio_data)
|
||||||
|
shape = np.copy(audio_data_absolute)
|
||||||
|
# average of the anbiant noise
|
||||||
|
avg = 0
|
||||||
|
# count represent the number of basic sample to etalonate the generic ambiant noise
|
||||||
|
count = 0
|
||||||
|
count_all = 0
|
||||||
|
set_value = 32767
|
||||||
|
# number of sampel of the anbiant noise calculation
|
||||||
|
ambiant_basic_size = 48000/4 # : 1/4 s ==> the cormus is manage to have 1 second before starting to have real data voice
|
||||||
|
for sample_audio in audio_data_absolute:
|
||||||
|
#shape[count_all] = 0
|
||||||
|
count_all += 1
|
||||||
|
if count < ambiant_basic_size:
|
||||||
|
# Remove the 0 padding
|
||||||
|
if count == 0 and sample_audio == 0:
|
||||||
|
continue
|
||||||
|
elif count == 0:
|
||||||
|
debug.info("start data " + str(count_all) + " samples (at value 0)")
|
||||||
|
count += 1
|
||||||
|
avg += sample_audio
|
||||||
|
elif count == ambiant_basic_size:
|
||||||
|
count += 1
|
||||||
|
avg /= ambiant_basic_size
|
||||||
|
debug.info("basic ambiant is: " + str(avg) + " start annalyse at : " + str(count_all/48000) + " sec")
|
||||||
|
if avg <= 327.67:
|
||||||
|
avg = int(327.67)
|
||||||
|
else:
|
||||||
|
avg = int(avg * 1.1)
|
||||||
|
## set_value = avg
|
||||||
|
debug.info(" inspect at " + str(avg))
|
||||||
|
else:
|
||||||
|
if sample_audio >= avg:
|
||||||
|
shape[count_all-1] = set_value;
|
||||||
|
|
||||||
|
#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_ori.wav")
|
||||||
|
#scipy.io.wavfile.write(filename_out, 48000, audio_data)
|
||||||
|
#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_abs.wav")
|
||||||
|
#scipy.io.wavfile.write(filename_out, 48000, audio_data_absolute)
|
||||||
|
#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape1.wav")
|
||||||
|
#scipy.io.wavfile.write(filename_out, 48000, shape)
|
||||||
|
|
||||||
|
windows = int(48000 * 0.02)
|
||||||
|
count_all = 0
|
||||||
|
count = 0
|
||||||
|
for sample_audio in shape:
|
||||||
|
count_all += 1
|
||||||
|
if sample_audio == set_value:
|
||||||
|
count = windows
|
||||||
|
continue
|
||||||
|
count -= 1
|
||||||
|
if count >= 0:
|
||||||
|
shape[count_all-1] = set_value;
|
||||||
|
shape = np.flip(shape)
|
||||||
|
count_all = 0
|
||||||
|
count = 0
|
||||||
|
for sample_audio in shape:
|
||||||
|
count_all += 1
|
||||||
|
if sample_audio == set_value:
|
||||||
|
count = windows
|
||||||
|
continue
|
||||||
|
count -= 1
|
||||||
|
if count >= 0:
|
||||||
|
shape[count_all-1] = set_value;
|
||||||
|
else:
|
||||||
|
shape[count_all-1] = 0
|
||||||
|
shape = np.flip(shape)
|
||||||
|
|
||||||
|
filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape.wav")
|
||||||
|
scipy.io.wavfile.write(filename_out, 48000, shape)
|
||||||
|
|
||||||
|
count_all = 0
|
||||||
|
table_voice_detected = [[0,False]]
|
||||||
|
previous = False
|
||||||
|
# use numpy slicing: [start:stop:step] https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#basic-slicing-and-indexing
|
||||||
|
for sample_audio in shape[3:len(shape):3]:
|
||||||
|
count_all += 1
|
||||||
|
if previous == False:
|
||||||
|
if sample_audio == set_value:
|
||||||
|
table_voice_detected.append([count_all, True])
|
||||||
|
previous = True
|
||||||
|
else:
|
||||||
|
if sample_audio == 0:
|
||||||
|
table_voice_detected.append([count_all, False])
|
||||||
|
previous = False
|
||||||
|
|
||||||
|
#######################################################
|
||||||
|
## Step 2: Resample
|
||||||
|
#######################################################
|
||||||
|
audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_best')
|
||||||
|
#audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_fast')
|
||||||
|
|
||||||
debug.info("write: " + str(len(audio_16k)) + " sample(s)")
|
debug.info("write: " + str(len(audio_16k)) + " sample(s)")
|
||||||
|
|
||||||
filename_out = os.path.join(args.output, input_data["audio_filename"])
|
filename_out = os.path.join(args.output, input_data["audio_filename"])
|
||||||
wavfile.write(filename_out, 16000, audio_16k)
|
scipy.io.wavfile.write(filename_out, 16000, audio_16k)
|
||||||
input_data["audio_sample_rate"] = 16000
|
# create new data format:
|
||||||
|
output_data = {
|
||||||
|
"value": input_data["value"],
|
||||||
|
"language": input_data["language"],
|
||||||
|
"audio_sample_rate": 16000,
|
||||||
|
"audio_filename": input_data["audio_filename"],
|
||||||
|
"VAD": table_voice_detected,
|
||||||
|
"action": [
|
||||||
|
{
|
||||||
|
"type": "resampling",
|
||||||
|
"tool": "resampy",
|
||||||
|
"desc": "48000 ==> 16000",
|
||||||
|
"src": elem,
|
||||||
|
},{
|
||||||
|
"type": "auto VAD",
|
||||||
|
"tool": "internal_1",
|
||||||
|
"desc": "When control the input data, with small noise, we can detect voice, just with signal power ...",
|
||||||
|
"src": elem,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
filename_out_json = os.path.join(args.output, os.path.basename(elem))
|
filename_out_json = os.path.join(args.output, os.path.basename(elem))
|
||||||
with open(filename_out_json, 'w') as outfile:
|
with open(filename_out_json, 'w') as outfile:
|
||||||
json.dump(input_data, outfile, indent="\t")
|
json.dump(output_data, outfile, indent="\t")
|
||||||
|
|
||||||
|
|
||||||
debug.info("Finish")
|
debug.info("Finish")
|
||||||
|
Loading…
x
Reference in New Issue
Block a user