eBay · jondegenhardt · Jun 17, 2020 · Jun 17, 2020 · Jun 17, 2020
diff --git a/number-lines/src/tsv_utils/number-lines.d b/number-lines/src/tsv_utils/number-lines.d
@@ -150,6 +150,14 @@ void numberLines(const NumberLinesOptions cmdopt, const string[] inputFiles)
                     bufferedOutput.append(cmdopt.delim);
                     bufferedOutput.appendln(line);
                     headerWritten = true;
+
+                    /* Flush the header immediately. This helps tasks further on in a
+                     * unix pipeline detect errors quickly, without waiting for all
+                     * the data to flow through the pipeline. Note that an upstream
+                     * task may have flushed its header line, so the header may
+                     * arrive long before the main block of data.
+                     */
+                    bufferedOutput.flush;
                 }
             }
             else

diff --git a/tsv-append/src/tsv_utils/tsv-append.d b/tsv-append/src/tsv_utils/tsv-append.d
@@ -101,14 +101,12 @@ EOS";
 struct TsvAppendOptions
 {
     string programName;
-    string[] files;                    // Input files
-    string[string] fileSourceNames;    // Maps file path to the 'source' value
-    bool helpVerbose = false;          // --help-verbose
-    string sourceHeader;               // --s|source-header
-    bool trackSource = false;          // --t|track-source
-    bool hasHeader = false;            // --H|header
-    char delim = '\t';                 // --d|delimiter
-    bool versionWanted = false;        // --V|version
+    string[] files;                    /// Input files
+    string[string] fileSourceNames;    /// Maps file path to the 'source' value
+    string sourceHeader;               /// --s|source-header
+    bool trackSource = false;          /// --t|track-source
+    bool hasHeader = false;            /// --H|header
+    char delim = '\t';                 /// --d|delimiter
 
     /* fileOptionHandler processes the '--f|file source=file' option. */
     private void fileOptionHandler(string option, string optionVal) pure @safe
@@ -144,6 +142,9 @@ struct TsvAppendOptions
         import std.getopt;
         import std.path : baseName, stripExtension;
 
+        bool helpVerbose = false;          // --help-verbose
+        bool versionWanted = false;        // --V|version
+
         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
 
         try
@@ -211,7 +212,7 @@ struct TsvAppendOptions
 void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
 if (isOutputRange!(OutputRange, char))
 {
-    import tsv_utils.common.utils : bufferedByLine;
+    import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange;
 
     bool headerWritten = false;
     foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
@@ -232,6 +233,14 @@ if (isOutputRange!(OutputRange, char))
                     outputStream.put(line);
                     outputStream.put('\n');
                     headerWritten = true;
+
+                    /* Flush the header immediately. This helps tasks further on in a
+                     * unix pipeline detect errors quickly, without waiting for all
+                     * the data to flow through the pipeline. Note that an upstream
+                     * task may have flushed its header line, so the header may
+                     * arrive long before the main block of data.
+                     */
+                    static if (isFlushableOutputRange!OutputRange) outputStream.flush;
                 }
             }
             else

diff --git a/tsv-summarize/src/tsv_utils/tsv-summarize.d b/tsv-summarize/src/tsv_utils/tsv-summarize.d
@@ -148,21 +148,35 @@ EOS";
 struct TsvSummarizeOptions {
     import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;
 
-    string programName;                // Program name
-    ByLineSourceRange!() inputSources; // Input Files
-    size_t[] keyFields;                // -g, --group-by
-    bool hasHeader = false;            // --header
-    bool writeHeader = false;          // -w, --write-header
-    char inputFieldDelimiter = '\t';   // --d|delimiter
-    char valuesDelimiter = '|';        // --v|values-delimiter
-    size_t floatPrecision = 12;        // --p|float-precision
-    bool excludeMissing = false;       // --x|exclude-missing
-    string missingValueReplacement;    // --r|replace-missing
-    bool helpVerbose = false;          // --help-verbose
-    bool versionWanted = false;        // --V|version
-    DList!Operator operators;          // Operators, in the order specified.
-    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.
-    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.
+    string programName;                /// Program name
+    ByLineSourceRange!() inputSources; /// Input Files
+    size_t[] keyFields;                /// -g, --group-by
+    bool hasHeader = false;            /// --header
+    bool writeHeader = false;          /// -w, --write-header
+    char inputFieldDelimiter = '\t';   /// --d|delimiter
+    char valuesDelimiter = '|';        /// --v|values-delimiter
+    size_t floatPrecision = 12;        /// --p|float-precision
+    DList!Operator operators;          /// Operators, in the order specified.
+    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.
+    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value.
+
+    /* tsv-summarize operators require access to the header line when the operator is
+     * created. This is because named fields may be used to describe field names. To
+     * enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
+     * array during initial processing by std.getopt. The group-by operation is
+     * similar, but is added to the cmdLineOtherFieldOptions instead. At least one
+     * cmdLineOperatorOptions entry is required.
+     *
+     * The different handlers are defined after processArgs.
+     */
+
+    /* CmdOptionHandler delegate signature - This is the call made to process the command
+     * line option arguments after the header line has been read.
+     */
+    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);
+
+    private CmdOptionHandler[]  cmdLineOperatorOptions;
+    private CmdOptionHandler[]  cmdLineOtherFieldOptions;
 
     /* Returns a tuple. First value is true if command line arguments were successfully
      * processed and execution should continue, or false if an error occurred or the user
@@ -177,7 +191,13 @@ struct TsvSummarizeOptions {
         import std.path : baseName, stripExtension;
         import std.typecons : Yes, No;
         import tsv_utils.common.getopt_inorder;
-        import tsv_utils.common.fieldlist :  makeFieldListOptionHandler;
+        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
+
+        bool helpVerbose = false;          // --help-verbose
+        bool versionWanted = false;        // --V|version
+        bool excludeMissing = false;       // --x|exclude-missing
+        string missingValueReplacement;    // --r|replace-missing
+
 
         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
 
@@ -250,20 +270,60 @@ struct TsvSummarizeOptions {
              */
             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
             cmdArgs.length = 1;
-            inputSources = byLineSourceRange(filepaths);
+
+            /* Validation and derivations - Do as much validation prior to header line
+             * processing as possible (avoids waiting on stdin).
+             */
+
+            enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");
+
+            enforce(inputFieldDelimiter != valuesDelimiter,
+                    "Cannot use the same character for both --d|field-delimiter and --v|values-delimiter.");
+
+            enforce(!(excludeMissing && missingValueReplacement.length != 0),
+                    "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
+
+            /* Missing field policy. */
+            globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
 
             string[] headerFields;
 
-            if (hasHeader && !inputSources.front.byLine.empty)
+            /* fieldListArgProcessing encapsulates the field list processing. It is
+             * called prior to reading the header line if headers are not being used,
+             * and after if headers are being used.
+             */
+            void fieldListArgProcessing()
             {
-                headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
+                /* Run all the operator handlers. */
+                cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
+                cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));
+
+                /* keyFields need to be part of the endFieldIndex, which is one past
+                 * the last field index. */
+                keyFields.each!(delegate (size_t x)
+                                {
+                                    if (x >= endFieldIndex) endFieldIndex = x + 1;
+                                } );
             }
 
-            cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
-            cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));
+            if (!hasHeader) fieldListArgProcessing();
+
+            /*
+             * Create the byLineSourceRange and perform header line processing.
+             */
+            inputSources = byLineSourceRange(filepaths);
+
 
-            consistencyValidations();
-            derivations(); // After processing cmdLine[OtherField|Operator]Options.
+            if (hasHeader)
+            {
+                if (!inputSources.front.byLine.empty)
+                {
+                    throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
+                    headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
+                }
+
+                fieldListArgProcessing();
+            }
         }
         catch (Exception exc)
         {
@@ -273,15 +333,7 @@ struct TsvSummarizeOptions {
         return tuple(true, 0);
     }
 
-    /* CmdOptionHandler delegate signature - This is the call made to process the command
-     * line option arguments after the header line has been read.
-     */
-    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);
-
-    CmdOptionHandler[]  cmdLineOperatorOptions;
-    CmdOptionHandler[]  cmdLineOtherFieldOptions;
-
-    void addGroupByOptionHandler(string option, string optionVal)
+    private void addGroupByOptionHandler(string option, string optionVal)
     {
         cmdLineOtherFieldOptions ~=
             (bool hasHeader, string[] headerFields)
@@ -306,7 +358,7 @@ struct TsvSummarizeOptions {
         }
     }
 
-    void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
+    private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
     {
         cmdLineOperatorOptions ~=
             (bool hasHeader, string[] headerFields)
@@ -367,7 +419,7 @@ struct TsvSummarizeOptions {
         }
     }
 
-    void addQuantileOperatorOptionHandler(string option, string optionVal)
+    private void addQuantileOperatorOptionHandler(string option, string optionVal)
     {
         cmdLineOperatorOptions ~=
             (bool hasHeader, string[] headerFields)
@@ -438,7 +490,7 @@ struct TsvSummarizeOptions {
 
     }
 
-    void addCountOptionHandler()
+    private void addCountOptionHandler()
     {
         cmdLineOperatorOptions ~=
             (bool hasHeader, string[] headerFields)
@@ -450,7 +502,7 @@ struct TsvSummarizeOptions {
         operators.insertBack(new CountOperator());
     }
 
-    void addCountHeaderOptionHandler(string option, string optionVal)
+   private  void addCountHeaderOptionHandler(string option, string optionVal)
     {
         cmdLineOperatorOptions ~=
             (bool hasHeader, string[] headerFields)
@@ -463,59 +515,55 @@ struct TsvSummarizeOptions {
         op.setCustomHeader(optionVal);
         operators.insertBack(op);
     }
-
-    /* This routine does validations not handled by processArgs. */
-    private void consistencyValidations()
-    {
-        enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");
-
-        enforce(inputFieldDelimiter != valuesDelimiter,
-                "Cannot use the same character for both --d|field-delimiter and --v|values-delimiter.");
-
-        enforce(!(excludeMissing && missingValueReplacement.length != 0),
-                "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
-    }
-
-    /* Post-processing derivations. */
-    void derivations()
-    {
-        /* keyFields need to part of the endFieldIndex, which is one past the last field index. */
-        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );
-
-        /* Missing field policy. */
-        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
-    }
 }
 
 /** tsvSummarize does the primary work of the tsv-summarize program.
  */
 void tsvSummarize(ref TsvSummarizeOptions cmdopt)
 {
-    import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine,
-        throwIfWindowsNewlineOnUnix;
+    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
+        bufferedByLine, throwIfWindowsNewlineOnUnix;
 
     /* Check that the input files were setup as expected. Should at least have one
      * input, stdin if nothing else, and newlines removed from the byLine range.
      */
     assert(!cmdopt.inputSources.empty);
     static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));
 
+    /* BufferedOutputRange is faster than writing directly to stdout if many lines are
+     * being written. This will happen mostly when group-by is used.
+     */
+    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
+
     /* Pick the Summarizer based on the number of key-fields entered. */
     auto summarizer =
         (cmdopt.keyFields.length == 0)
-        ? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))(
+        ? new NoKeySummarizer!(typeof(bufferedOutput))(
             cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
 
         : (cmdopt.keyFields.length == 1)
-        ? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))(
+        ? new OneKeySummarizer!(typeof(bufferedOutput))(
             cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
 
-        : new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))(
+        : new MultiKeySummarizer!(typeof(bufferedOutput))(
             cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
 
     /* Add the operators to the Summarizer. */
     summarizer.setOperators(inputRangeObject(cmdopt.operators[]));
 
+    /* If there's no input header line, but writing an output header anyway, then
+     * write it now. This helps tasks further on in a unix pipeline detect errors
+     * quickly, without waiting for all the data to flow through the pipeline.
+     */
+    auto printOptions = SummarizerPrintOptions(
+        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
+
+    if (!cmdopt.hasHeader && cmdopt.writeHeader)
+    {
+        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
+        bufferedOutput.flush;
+    }
+
     /* Process each input file, one line at a time. */
     auto lineFields = new char[][](cmdopt.endFieldIndex);
     bool headerFound = false;
@@ -567,6 +615,15 @@ void tsvSummarize(ref TsvSummarizeOptions cmdopt)
                 {
                     summarizer.processHeaderLine(lineFields);
                     headerFound = true;
+
+                    /* Write the header now. This helps tasks further on in a unix
+                     * pipeline detect errors quickly, without waiting for all the
+                     * data to flow through the pipeline. Note that an upstream task
+                     * may have flushed its header line, so the header may arrive
+                     * long before the main block of data.
+                     */
+                    summarizer.writeSummaryHeader(bufferedOutput, printOptions);
+                    bufferedOutput.flush;
                 }
             }
             else
@@ -589,16 +646,8 @@ void tsvSummarize(ref TsvSummarizeOptions cmdopt)
     debug writeln("[tsvSummarize] After reading all data.");
 
     /* Whew! We're done processing input data. Run the calculations and print. */
-    auto printOptions = SummarizerPrintOptions(
-        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
-    auto stdoutWriter = stdout.lockingTextWriter;
-
-    if (cmdopt.hasHeader || cmdopt.writeHeader)
-    {
-        summarizer.writeSummaryHeader(stdoutWriter, printOptions);
-    }
 
-    summarizer.writeSummaryBody(stdoutWriter, printOptions);
+    summarizer.writeSummaryBody(bufferedOutput, printOptions);
 }
 
 /** The default field header. This is used when the input doesn't have field headers,