-
Notifications
You must be signed in to change notification settings - Fork 2.9k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a realtime ASR demo (both server and client) for DS2 users to try with own voice. #186
Changes from all commits
ae84c6f
aee3e11
a40c622
d923a93
9955f05
b9f89fa
2cf6e7a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -83,6 +83,23 @@ def __init__(self, | |
self._rng = random.Random(random_seed) | ||
self._epoch = 0 | ||
|
||
def process_utterance(self, filename, transcript): | ||
"""Load, augment, featurize and normalize for speech data. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. """换一行再写注释,下同 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The practice of starting the docstring summary on the same line as the opening """
follows the Google Coding Style for Python, and we keep this style consistent throughout the whole DS2 project. |
||
|
||
:param filename: Audio filepath | ||
:type filename: basestring | ||
:param transcript: Transcription text. | ||
:type transcript: basestring | ||
:return: Tuple of audio feature tensor and list of token ids for | ||
transcription. | ||
:rtype: tuple of (2darray, list) | ||
""" | ||
speech_segment = SpeechSegment.from_file(filename, transcript) | ||
self._augmentation_pipeline.transform_audio(speech_segment) | ||
specgram, text_ids = self._speech_featurizer.featurize(speech_segment) | ||
specgram = self._normalizer.apply(specgram) | ||
return specgram, text_ids | ||
|
||
def batch_reader_creator(self, | ||
manifest_path, | ||
batch_size, | ||
|
@@ -198,14 +215,6 @@ def vocab_list(self): | |
""" | ||
return self._speech_featurizer.vocab_list | ||
|
||
def _process_utterance(self, filename, transcript): | ||
"""Load, augment, featurize and normalize for speech data.""" | ||
speech_segment = SpeechSegment.from_file(filename, transcript) | ||
self._augmentation_pipeline.transform_audio(speech_segment) | ||
specgram, text_ids = self._speech_featurizer.featurize(speech_segment) | ||
specgram = self._normalizer.apply(specgram) | ||
return specgram, text_ids | ||
|
||
def _instance_reader_creator(self, manifest): | ||
""" | ||
Instance reader creator. Create a callable function to produce | ||
|
@@ -220,8 +229,8 @@ def reader(): | |
yield instance | ||
|
||
def mapper(instance): | ||
return self._process_utterance(instance["audio_filepath"], | ||
instance["text"]) | ||
return self.process_utterance(instance["audio_filepath"], | ||
instance["text"]) | ||
|
||
return paddle.reader.xmap_readers( | ||
mapper, reader, self._num_threads, 1024, order=True) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,94 @@ | ||
"""Client-end for the ASR demo.""" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 开头这里用 # 来注释,可参考layers.py的注释方式。下同。 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In layers.py, # is only used for the copyright declaration. |
||
from pynput import keyboard | ||
import struct | ||
import socket | ||
import sys | ||
import argparse | ||
import pyaudio | ||
|
||
# Command-line options: address of the ASR demo server to connect to.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--host_ip", type=str, default="localhost",
    help="Server IP address. (default: %(default)s)")
parser.add_argument(
    "--host_port", type=int, default=8086,
    help="Server Port. (default: %(default)s)")
args = parser.parse_args()

# State shared between the keyboard callbacks and the audio callback.
# Set when space is pressed, cleared when space is released.
is_recording = False
# Cleared by the audio callback while an utterance is being captured and
# set again once that utterance has been sent to the server.
enable_trigger_record = True
|
||
|
||
def on_press(key):
    """Keyboard press handler: start recording when space is pressed.

    Recording begins only when no recording is already in progress and the
    audio callback has re-enabled the trigger (``enable_trigger_record``).

    :param key: Key reported by the keyboard listener.
    """
    global is_recording
    space_pressed = key == keyboard.Key.space
    ready_to_record = enable_trigger_record and not is_recording
    if space_pressed and ready_to_record:
        sys.stdout.write("Start Recording ... ")
        sys.stdout.flush()
        is_recording = True
|
||
|
||
def on_release(key):
    """Keyboard release handler.

    Releasing space ends the recording in progress, if any. Releasing esc
    returns ``False`` (per pynput, a callback returning ``False`` stops the
    listener).

    :param key: Key reported by the keyboard listener.
    :return: ``False`` when esc was released, otherwise ``None``.
    """
    global is_recording
    if key == keyboard.Key.esc:
        return False
    if key == keyboard.Key.space and is_recording:
        is_recording = False
|
||
|
||
data_list = [] | ||
|
||
|
||
def callback(in_data, frame_count, time_info, status):
    """Audio recorder's stream callback function.

    While ``is_recording`` is set, each incoming chunk is appended to
    ``data_list``. On the first call after recording has stopped, the
    buffered utterance is sent to the server as a 4-byte big-endian length
    prefix followed by the raw audio bytes, the server's reply is printed,
    and the buffer and record trigger are reset.

    :param in_data: Audio chunk delivered by the input stream.
    :param frame_count: Unused.
    :param time_info: Unused.
    :param status: Unused.
    :return: Tuple of ``(in_data, pyaudio.paContinue)``.
    """
    global data_list, is_recording, enable_trigger_record
    if is_recording:
        data_list.append(in_data)
        # Block a new recording trigger until this utterance has been sent.
        enable_trigger_record = False
    elif data_list:
        # bytes join works for both Python 2 str chunks and Python 3 bytes.
        payload = b''.join(data_list)
        # A fresh connection is opened for every utterance: setup costs only
        # a few milliseconds, and it lets the server handle exactly one
        # utterance per handle() call instead of looping over a connection.
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.connect((args.host_ip, args.host_port))
            sock.sendall(struct.pack('>i', len(payload)) + payload)
            print('Speech[length=%d] Sent.' % len(payload))
            # Receive data from the server and shut down.
            # NOTE(review): a single recv() reads at most 1024 bytes; longer
            # replies would be truncated -- confirm against the server.
            received = sock.recv(1024)
        finally:
            # Always release the socket, even if connect/send/recv failed.
            sock.close()
        if not isinstance(received, str):
            # Python 3 returns bytes; assumes the reply is UTF-8 -- TODO
            # confirm against the server side.
            received = received.decode('utf-8', 'replace')
        print("Recognition Results: {}".format(received))
        data_list = []
        enable_trigger_record = True
    return (in_data, pyaudio.paContinue)
|
||
|
||
def main():
    """Run the ASR demo client until the keyboard listener stops.

    Opens a mono, 16 kHz, 32-bit input stream whose chunks are handed to
    ``callback``, then blocks on the keyboard listener. The audio stream and
    the PyAudio instance are released on exit, including when the listener
    or the stream start-up raises.
    """
    # prepare audio recorder
    p = pyaudio.PyAudio()
    try:
        stream = p.open(
            format=pyaudio.paInt32,
            channels=1,
            rate=16000,
            input=True,
            stream_callback=callback)
        try:
            stream.start_stream()

            # prepare keyboard listener; join() blocks until it stops
            with keyboard.Listener(
                    on_press=on_press, on_release=on_release) as listener:
                listener.join()
        finally:
            # close up the stream even if the listener raised
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()
|
||
|
||
if __name__ == "__main__": | ||
main() |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
transcript -> transcription, they have different meanings.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Both "transcription" and "transcript" can refer to the text content of speech. I use "transcription" in docs and "transcript" for code variables (because it is shorter).