
Add a realtime ASR demo (both server and client) for DS2 users to try with own voice. #186

Merged · 7 commits · Aug 7, 2017
25 changes: 25 additions & 0 deletions deep_speech_2/README.md
100644 → 100755
@@ -143,3 +143,28 @@ python tune.py --help
```

Then reset parameters with the tuning result before inference or evaluating.

### Playing with the ASR Demo

A real-time ASR demo is built for users to try out the ASR model with their own voice. Please do the following installation on the machine where you'd like to run the demo's client (it is not needed on the machine running the demo's server).

For example, on Mac OS X:

```
brew install portaudio
pip install pyaudio
pip install pynput
```
After an acoustic model and a language model are prepared, we can first start the demo's server:

```
CUDA_VISIBLE_DEVICES=0 python demo_server.py
```
And then in another console, start the demo's client:

```
python demo_client.py
```
On the client console, press and hold the space key on the keyboard to start talking, and release it when you finish your speech. The decoding results (the inferred transcription) will then be displayed.

The server and the client can also be started on two separate machines, e.g. `demo_client.py` usually runs on a machine with a microphone, while `demo_server.py` usually runs on a remote server with powerful GPUs. First make sure that the two machines can reach each other over the network, then pass `--host_ip` and `--host_port` to both `demo_server.py` and `demo_client.py` to indicate the server machine's actual IP address (instead of the default `localhost`) and TCP port.
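For instance, using a hypothetical server address `192.168.1.20` purely for illustration, the two-machine setup could look like:

```
# On the server machine (with GPUs):
CUDA_VISIBLE_DEVICES=0 python demo_server.py --host_ip 192.168.1.20 --host_port 8086

# On the client machine (with a microphone):
python demo_client.py --host_ip 192.168.1.20 --host_port 8086
```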
29 changes: 19 additions & 10 deletions deep_speech_2/data_utils/data.py
@@ -83,6 +83,23 @@ def __init__(self,
self._rng = random.Random(random_seed)
self._epoch = 0

def process_utterance(self, filename, transcript):
Contributor: transcript -> transcription, they have different meanings.

Contributor Author: Both "transcription" and "transcript" can refer to the text content of speech. I use "transcription" in docs and "transcript" for code variables (due to the shorter length).

"""Load, augment, featurize and normalize for speech data.
Contributor: Start the comment on a new line after the opening `"""`; same below.

Contributor Author: The practice of

def function():
    """This is function doc.
    """

follows the Google Python Style Guide, and we keep this style consistent throughout the whole DS2 project.


:param filename: Audio filepath.
:type filename: basestring
:param transcript: Transcription text.
:type transcript: basestring
:return: Tuple of audio feature tensor and list of token ids for
transcription.
:rtype: tuple of (2darray, list)
"""
speech_segment = SpeechSegment.from_file(filename, transcript)
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
specgram = self._normalizer.apply(specgram)
return specgram, text_ids

def batch_reader_creator(self,
manifest_path,
batch_size,
@@ -198,14 +215,6 @@ def vocab_list(self):
"""
return self._speech_featurizer.vocab_list

def _process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data."""
speech_segment = SpeechSegment.from_file(filename, transcript)
self._augmentation_pipeline.transform_audio(speech_segment)
specgram, text_ids = self._speech_featurizer.featurize(speech_segment)
specgram = self._normalizer.apply(specgram)
return specgram, text_ids

def _instance_reader_creator(self, manifest):
"""
Instance reader creator. Create a callable function to produce
@@ -220,8 +229,8 @@ def reader():
yield instance

def mapper(instance):
return self._process_utterance(instance["audio_filepath"],
instance["text"])
return self.process_utterance(instance["audio_filepath"],
instance["text"])

return paddle.reader.xmap_readers(
mapper, reader, self._num_threads, 1024, order=True)
94 changes: 94 additions & 0 deletions deep_speech_2/demo_client.py
@@ -0,0 +1,94 @@
"""Client-end for the ASR demo."""
Contributor: Use `#` for the comment at the top of the file; see the comment style in layers.py. Same below.

Contributor Author: In layers.py, `#` is only used for the copyright declaration. For a file's doc, `"""` is used instead.

from pynput import keyboard
import struct
import socket
import sys
import argparse
import pyaudio

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--host_ip",
default="localhost",
type=str,
help="Server IP address. (default: %(default)s)")
parser.add_argument(
"--host_port",
default=8086,
type=int,
help="Server Port. (default: %(default)s)")
args = parser.parse_args()

is_recording = False
enable_trigger_record = True


def on_press(key):
"""On-press keyboard callback function."""
global is_recording, enable_trigger_record
if key == keyboard.Key.space:
if (not is_recording) and enable_trigger_record:
sys.stdout.write("Start Recording ... ")
sys.stdout.flush()
is_recording = True


def on_release(key):
"""On-release keyboard callback function."""
global is_recording, enable_trigger_record
if key == keyboard.Key.esc:
return False
elif key == keyboard.Key.space:
if is_recording:
is_recording = False


data_list = []


def callback(in_data, frame_count, time_info, status):
"""Audio recorder's stream callback function."""
global data_list, is_recording, enable_trigger_record
if is_recording:
data_list.append(in_data)
enable_trigger_record = False
elif len(data_list) > 0:
# Connect to server and send data
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
sock.connect((args.host_ip, args.host_port))
Contributor: I think it's better to connect once at first and reuse the connection, rather than connecting every time a message is sent.

Contributor Author: Opening a connection only costs several milliseconds. Besides, opening an independent connection for each utterance simplifies the code (otherwise, on the server side, we would have to handle multiple utterances with a while-loop in a single handle() call).

sent = ''.join(data_list)
sock.sendall(struct.pack('>i', len(sent)) + sent)
print('Speech[length=%d] Sent.' % len(sent))
# Receive data from the server and shut down
received = sock.recv(1024)
print("Recognition Results: {}".format(received))
sock.close()
data_list = []
enable_trigger_record = True
return (in_data, pyaudio.paContinue)


def main():
# prepare audio recorder
p = pyaudio.PyAudio()
stream = p.open(
format=pyaudio.paInt32,
channels=1,
rate=16000,
input=True,
stream_callback=callback)
stream.start_stream()

# prepare keyboard listener
with keyboard.Listener(
on_press=on_press, on_release=on_release) as listener:
listener.join()

# close up
stream.stop_stream()
stream.close()
p.terminate()


if __name__ == "__main__":
main()
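A side note on the wire format used above: the client frames each utterance as a 4-byte big-endian length header followed by the raw audio bytes (`struct.pack('>i', len(sent)) + sent`). The following minimal sketch shows this framing and how a receiving end could parse it; the `frame`/`parse_frame` names are illustrative, not part of this PR.

```python
import struct

def frame(payload):
    """Prefix payload with a 4-byte big-endian length header."""
    return struct.pack('>i', len(payload)) + payload

def parse_frame(data):
    """Split one framed message into (payload, remaining bytes)."""
    (length,) = struct.unpack('>i', data[:4])
    return data[4:4 + length], data[4 + length:]

# Round-trip a dummy "utterance" through the framing.
msg = frame(b'audio-bytes')
payload, rest = parse_frame(msg)
assert payload == b'audio-bytes' and rest == b''
```

Because each utterance opens its own connection (as discussed in the review thread above), the server only ever needs to parse a single frame per connection.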