From 4b3c96595e15b044b528399021481bd9ac400c40 Mon Sep 17 00:00:00 2001
From: Edouard DUPIN
Date: Mon, 1 Apr 2019 22:21:19 +0200
Subject: [PATCH] [DEV] update normalisation preprocessing

---
 .../audio-reco-corpus/preprocessCorpus.py | 131 +++++++++++++++++-
 1 file changed, 125 insertions(+), 6 deletions(-)

diff --git a/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py b/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
index 9ee89b0..85a1cec 100755
--- a/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
+++ b/normalizer_corpus/audio-reco-corpus/preprocessCorpus.py
@@ -14,7 +14,10 @@ import math
 import json
 import resampy
 import numpy as np
-import scipy.io.wavfile as wavfile
+import scipy.io.wavfile
+import scipy.signal
+
+
 
 parser = argparse.ArgumentParser()
 parser.add_argument("-i", "--input", help="Input directory",
@@ -81,21 +84,137 @@ for elem in audio_corpus_element:
 		debug.error(" ==> missing field 'audio_filename' ...")
 	filename = os.path.join(os.path.dirname(elem), input_data["audio_filename"])
 	if filename[-3:] == "wav":
-		sample_rate, audio_data = wavfile.read(filename)
+		sample_rate, audio_data = scipy.io.wavfile.read(filename)
 	else:
 		debug.error("Not supported file type: '" + str(filename[-3:]) + "' suported: [wav]")
 	debug.info("Read: " + str(len(audio_data)) + " sample(s)")
-	audio_16k = resampy.resample(audio_data, 48000, 16000)
+	
+	#######################################################
+	## Step 1: Generate speech areas (work on the raw input, which is guaranteed not to be modified yet)
+	#######################################################
+	audio_data_absolute = np.absolute(audio_data)
+	shape = np.copy(audio_data_absolute)
+	# average level of the ambient noise
+	avg = 0
+	# count is the number of samples used to calibrate the generic ambient noise
+	count = 0
+	count_all = 0
+	set_value = 32767
+	# number of samples used for the ambient noise calculation
+	ambiant_basic_size = 48000/4 # 1/4 s ==> the corpus is arranged to have 1 second of silence before the real voice data starts
+	for sample_audio in audio_data_absolute:
+		#shape[count_all] = 0
+		count_all += 1
+		if count < ambiant_basic_size:
+			# Remove the 0 padding
+			if count == 0 and sample_audio == 0:
+				continue
+			elif count == 0:
+				debug.info("data starts at sample " + str(count_all) + " (first non-zero value)")
+			count += 1
+			avg += int(sample_audio) # cast to a python int to avoid int16 overflow while accumulating
+		elif count == ambiant_basic_size:
+			count += 1
+			avg /= ambiant_basic_size
+			debug.info("basic ambient level is: " + str(avg) + ", start analysis at: " + str(count_all/48000) + " sec")
+			if avg <= 327.67:
+				avg = int(327.67)
+			else:
+				avg = int(avg * 1.1)
+			## set_value = avg
+			debug.info("    inspect at " + str(avg))
+		else:
+			if sample_audio >= avg:
+				shape[count_all-1] = set_value
+	
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_ori.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, audio_data)
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_abs.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, audio_data_absolute)
+	#filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape1.wav")
+	#scipy.io.wavfile.write(filename_out, 48000, shape)
+	
+	windows = int(48000 * 0.02)
+	count_all = 0
+	count = 0
+	for sample_audio in shape:
+		count_all += 1
+		if sample_audio == set_value:
+			count = windows
+			continue
+		count -= 1
+		if count >= 0:
+			shape[count_all-1] = set_value
+	shape = np.flip(shape)
+	count_all = 0
+	count = 0
+	for sample_audio in shape:
+		count_all += 1
+		if sample_audio == set_value:
+			count = windows
+			continue
+		count -= 1
+		if count >= 0:
+			shape[count_all-1] = set_value
+		else:
+			shape[count_all-1] = 0
+	shape = np.flip(shape)
+	
+	filename_out = os.path.join(args.output, input_data["audio_filename"] + "_shape.wav")
+	scipy.io.wavfile.write(filename_out, 48000, shape)
+	
+	count_all = 0
+	table_voice_detected = [[0,False]]
+	previous = False
+	# use numpy slicing: [start:stop:step] https://docs.scipy.org/doc/numpy/reference/arrays.indexing.html#basic-slicing-and-indexing
+	for sample_audio in shape[3:len(shape):3]:
+		count_all += 1
+		if previous == False:
+			if sample_audio == set_value:
+				table_voice_detected.append([count_all, True])
+				previous = True
+		else:
+			if sample_audio == 0:
+				table_voice_detected.append([count_all, False])
+				previous = False
+	
+	#######################################################
+	## Step 2: Resample
+	#######################################################
+	audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_best')
+	#audio_16k = resampy.resample(audio_data, 48000, 16000, filter='kaiser_fast')
 	debug.info("write: " + str(len(audio_16k)) + " sample(s)")
 	filename_out = os.path.join(args.output, input_data["audio_filename"])
-	wavfile.write(filename_out, 16000, audio_16k)
-	input_data["audio_sample_rate"] = 16000
+	scipy.io.wavfile.write(filename_out, 16000, audio_16k)
+	# create the new data format:
+	output_data = {
+		"value": input_data["value"],
+		"language": input_data["language"],
+		"audio_sample_rate": 16000,
+		"audio_filename": input_data["audio_filename"],
+		"VAD": table_voice_detected,
+		"action": [
+			{
+				"type": "resampling",
+				"tool": "resampy",
+				"desc": "48000 ==> 16000",
+				"src": elem,
+			},{
+				"type": "auto VAD",
+				"tool": "internal_1",
+				"desc": "With controlled input data and low background noise, voice can be detected from the signal power alone ...",
+				"src": elem,
+			},
+		],
+	}
+	
+	filename_out_json = os.path.join(args.output, os.path.basename(elem))
 	with open(filename_out_json, 'w') as outfile:
-		json.dump(input_data, outfile, indent="\t")
+		json.dump(output_data, outfile, indent="\t")
 debug.info("Finish")
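
The two passes over `shape` (forward, then again on the flipped signal) extend every above-threshold sample by `windows` samples in each direction, i.e. a symmetric 20 ms dilation of the voice mask. The patch also adds `import scipy.signal` without using it yet. Purely as an illustration of the same idea, and not part of the patch, a vectorized equivalent of those two passes could look roughly like the sketch below; the function name, the 20 ms hangover value and the boolean-mask output are assumptions taken from the loops above, not something the patch defines.

    # Sketch only, not part of the patch: a vectorized equivalent of the
    # forward + backward 20 ms "hangover" passes applied to the 48 kHz signal.
    import numpy as np

    def dilate_voice_mask(audio_data, threshold, sample_rate=48000, hangover_s=0.02):
        # raw above-threshold mask, same criterion as the per-sample loop
        mask = np.absolute(audio_data) >= threshold
        # extend every detection by `window` samples on each side
        window = int(sample_rate * hangover_s)
        kernel = np.ones(2 * window + 1)
        # np.convolve is exact here (0/1 values); scipy.signal.fftconvolve could
        # be substituted for speed on long recordings
        return np.convolve(mask.astype(float), kernel, mode="same") > 0

The result is a boolean mask rather than the 32767/0 `shape` signal used in the patch, so it would have to be scaled back to int16 before being written out as a debug wav.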
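The emitted JSON now carries a "VAD" table of [index, is_voice] transitions. Since the detection loop walks the 48 kHz `shape` signal with a step of 3, each index appears to correspond to one sample of the resampled 16 kHz output, so time in seconds would be index / 16000. A minimal consumer sketch under that assumption follows; the "corpus_element.json" path is a hypothetical placeholder for one of the JSON files written by the script.

    # Sketch only: read the "VAD" transition table written by preprocessCorpus.py
    # and print it as time ranges, assuming one VAD index == one sample at 16 kHz.
    import json

    SAMPLE_RATE_OUT = 16000  # rate of the resampled audio written next to the JSON

    with open("corpus_element.json") as f:  # hypothetical output file of the script
        data = json.load(f)

    vad = data["VAD"]  # [[index, is_voice], ...], starting with [0, False]
    # pair consecutive transitions to get closed segments; the state after the
    # last transition simply runs to the end of the recording
    for (index, is_voice), (next_index, _) in zip(vad, vad[1:]):
        label = "voice" if is_voice else "silence"
        print("%8.3f s -> %8.3f s : %s" % (index / SAMPLE_RATE_OUT, next_index / SAMPLE_RATE_OUT, label))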