Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fieldlists: Refactor command line arg processing, part 2 #294

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions number-lines/src/tsv_utils/number-lines.d
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,14 @@ void numberLines(const NumberLinesOptions cmdopt, const string[] inputFiles)
bufferedOutput.append(cmdopt.delim);
bufferedOutput.appendln(line);
headerWritten = true;

/* Flush the header immediately. This helps tasks further on in a
* unix pipeline detect errors quickly, without waiting for all
* the data to flow through the pipeline. Note that an upstream
* task may have flushed its header line, so the header may
* arrive long before the main block of data.
*/
bufferedOutput.flush;
}
}
else
Expand Down
27 changes: 18 additions & 9 deletions tsv-append/src/tsv_utils/tsv-append.d
Original file line number Diff line number Diff line change
Expand Up @@ -101,14 +101,12 @@ EOS";
struct TsvAppendOptions
{
string programName;
string[] files; // Input files
string[string] fileSourceNames; // Maps file path to the 'source' value
bool helpVerbose = false; // --help-verbose
string sourceHeader; // --s|source-header
bool trackSource = false; // --t|track-source
bool hasHeader = false; // --H|header
char delim = '\t'; // --d|delimiter
bool versionWanted = false; // --V|version
string[] files; /// Input files
string[string] fileSourceNames; /// Maps file path to the 'source' value
string sourceHeader; /// --s|source-header
bool trackSource = false; /// --t|track-source
bool hasHeader = false; /// --H|header
char delim = '\t'; /// --d|delimiter

/* fileOptionHandler processes the '--f|file source=file' option. */
private void fileOptionHandler(string option, string optionVal) pure @safe
Expand Down Expand Up @@ -144,6 +142,9 @@ struct TsvAppendOptions
import std.getopt;
import std.path : baseName, stripExtension;

bool helpVerbose = false; // --help-verbose
bool versionWanted = false; // --V|version

programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

try
Expand Down Expand Up @@ -211,7 +212,7 @@ struct TsvAppendOptions
void tsvAppend(OutputRange)(TsvAppendOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
import tsv_utils.common.utils : bufferedByLine;
import tsv_utils.common.utils : bufferedByLine, isFlushableOutputRange;

bool headerWritten = false;
foreach (filename; (cmdopt.files.length > 0) ? cmdopt.files : ["-"])
Expand All @@ -232,6 +233,14 @@ if (isOutputRange!(OutputRange, char))
outputStream.put(line);
outputStream.put('\n');
headerWritten = true;

/* Flush the header immediately. This helps tasks further on in a
* unix pipeline detect errors quickly, without waiting for all
* the data to flow through the pipeline. Note that an upstream
* task may have flushed its header line, so the header may
* arrive long before the main block of data.
*/
static if (isFlushableOutputRange!OutputRange) outputStream.flush;
}
}
else
Expand Down
193 changes: 121 additions & 72 deletions tsv-summarize/src/tsv_utils/tsv-summarize.d
Original file line number Diff line number Diff line change
Expand Up @@ -148,21 +148,35 @@ EOS";
struct TsvSummarizeOptions {
import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

string programName; // Program name
ByLineSourceRange!() inputSources; // Input Files
size_t[] keyFields; // -g, --group-by
bool hasHeader = false; // --header
bool writeHeader = false; // -w, --write-header
char inputFieldDelimiter = '\t'; // --d|delimiter
char valuesDelimiter = '|'; // --v|values-delimiter
size_t floatPrecision = 12; // --p|float-precision
bool excludeMissing = false; // --x|exclude-missing
string missingValueReplacement; // --r|replace-missing
bool helpVerbose = false; // --help-verbose
bool versionWanted = false; // --V|version
DList!Operator operators; // Operators, in the order specified.
size_t endFieldIndex = 0; // Derived value. Max field index used plus one.
MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy; // Derived value.
string programName; /// Program name
ByLineSourceRange!() inputSources; /// Input Files
size_t[] keyFields; /// -g, --group-by
bool hasHeader = false; /// --header
bool writeHeader = false; /// -w, --write-header
char inputFieldDelimiter = '\t'; /// --d|delimiter
char valuesDelimiter = '|'; /// --v|values-delimiter
size_t floatPrecision = 12; /// --p|float-precision
DList!Operator operators; /// Operators, in the order specified.
size_t endFieldIndex = 0; /// Derived value. Max field index used plus one.
MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy; /// Derived value.

/* tsv-summarize operators require access to the header line when the operator is
* created. This is because named fields may be used to describe field names. To
* enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
* array during initial processing by std.getopt. The group-by operation is
* similar, but is added to the cmdLineOtherFieldOptions instead. At least one
* cmdLineOperatorOptions entry is required.
*
* The different handlers are defined after processArgs.
*/

/* CmdOptionHandler delegate signature - This is the call made to process the command
* line option arguments after the header line has been read.
*/
alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

private CmdOptionHandler[] cmdLineOperatorOptions;
private CmdOptionHandler[] cmdLineOtherFieldOptions;

/* Returns a tuple. First value is true if command line arguments were successfully
* processed and execution should continue, or false if an error occurred or the user
Expand All @@ -177,7 +191,13 @@ struct TsvSummarizeOptions {
import std.path : baseName, stripExtension;
import std.typecons : Yes, No;
import tsv_utils.common.getopt_inorder;
import tsv_utils.common.fieldlist : makeFieldListOptionHandler;
import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

bool helpVerbose = false; // --help-verbose
bool versionWanted = false; // --V|version
bool excludeMissing = false; // --x|exclude-missing
string missingValueReplacement; // --r|replace-missing


programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

Expand Down Expand Up @@ -250,20 +270,60 @@ struct TsvSummarizeOptions {
*/
string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
cmdArgs.length = 1;
inputSources = byLineSourceRange(filepaths);

/* Validation and derivations - Do as much validation prior to header line
* processing as possible (avoids waiting on stdin).
*/

enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

enforce(inputFieldDelimiter != valuesDelimiter,
"Cannot use the same character for both --d|field-delimiter and --v|values-delimiter.");

enforce(!(excludeMissing && missingValueReplacement.length != 0),
"Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");

/* Missing field policy. */
globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);

string[] headerFields;

if (hasHeader && !inputSources.front.byLine.empty)
/* fieldListArgProcessing encapsulates the field list processing. It is
* called prior to reading the header line if headers are not being used,
* and after if headers are being used.
*/
void fieldListArgProcessing()
{
headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
/* Run all the operator handlers. */
cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));

/* keyFields need to be part of the endFieldIndex, which is one past
* the last field index. */
keyFields.each!(delegate (size_t x)
{
if (x >= endFieldIndex) endFieldIndex = x + 1;
} );
}

cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));
if (!hasHeader) fieldListArgProcessing();

/*
* Create the byLineSourceRange and perform header line processing.
*/
inputSources = byLineSourceRange(filepaths);


consistencyValidations();
derivations(); // After processing cmdLine[OtherField|Operator]Options.
if (hasHeader)
{
if (!inputSources.front.byLine.empty)
{
throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
}

fieldListArgProcessing();
}
}
catch (Exception exc)
{
Expand All @@ -273,15 +333,7 @@ struct TsvSummarizeOptions {
return tuple(true, 0);
}

/* CmdOptionHandler delegate signature - This is the call made to process the command
* line option arguments after the header line has been read.
*/
alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

CmdOptionHandler[] cmdLineOperatorOptions;
CmdOptionHandler[] cmdLineOtherFieldOptions;

void addGroupByOptionHandler(string option, string optionVal)
private void addGroupByOptionHandler(string option, string optionVal)
{
cmdLineOtherFieldOptions ~=
(bool hasHeader, string[] headerFields)
Expand All @@ -306,7 +358,7 @@ struct TsvSummarizeOptions {
}
}

void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
{
cmdLineOperatorOptions ~=
(bool hasHeader, string[] headerFields)
Expand Down Expand Up @@ -367,7 +419,7 @@ struct TsvSummarizeOptions {
}
}

void addQuantileOperatorOptionHandler(string option, string optionVal)
private void addQuantileOperatorOptionHandler(string option, string optionVal)
{
cmdLineOperatorOptions ~=
(bool hasHeader, string[] headerFields)
Expand Down Expand Up @@ -438,7 +490,7 @@ struct TsvSummarizeOptions {

}

void addCountOptionHandler()
private void addCountOptionHandler()
{
cmdLineOperatorOptions ~=
(bool hasHeader, string[] headerFields)
Expand All @@ -450,7 +502,7 @@ struct TsvSummarizeOptions {
operators.insertBack(new CountOperator());
}

void addCountHeaderOptionHandler(string option, string optionVal)
private void addCountHeaderOptionHandler(string option, string optionVal)
{
cmdLineOperatorOptions ~=
(bool hasHeader, string[] headerFields)
Expand All @@ -463,59 +515,55 @@ struct TsvSummarizeOptions {
op.setCustomHeader(optionVal);
operators.insertBack(op);
}

/* This routine does validations not handled by processArgs.
*
* Three conditions are checked with enforce, each with its own error message:
*   - cmdLineOperatorOptions is not empty, i.e. at least one operator handler
*     was registered;
*   - inputFieldDelimiter and valuesDelimiter are different characters;
*   - excludeMissing and a non-empty missingValueReplacement are not both set.
*/
private void consistencyValidations()
{
enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

enforce(inputFieldDelimiter != valuesDelimiter,
"Cannot use the same character for both --d|field-delimiter and --v|values-delimiter.");

/* A non-empty replacement string is treated as replace-missing being in use. */
enforce(!(excludeMissing && missingValueReplacement.length != 0),
"Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
}

/* Post-processing derivations.
*
* Updates two derived members: endFieldIndex is raised, if needed, so that it
* is greater than every index in keyFields, and globalMissingPolicy is updated
* from the excludeMissing and missingValueReplacement settings.
*/
void derivations()
{
/* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

/* Missing field policy. */
globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
}
}

/** tsvSummarize does the primary work of the tsv-summarize program.
*/
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine,
throwIfWindowsNewlineOnUnix;
import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
bufferedByLine, throwIfWindowsNewlineOnUnix;

/* Check that the input files were set up as expected. There should be at least one
* input, stdin if nothing else, and newlines removed from the byLine range.
*/
assert(!cmdopt.inputSources.empty);
static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

/* BufferedOutputRange is faster than writing directly to stdout if many lines are
* being written. This will happen mostly when group-by is used.
*/
auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

/* Pick the Summarizer based on the number of key-fields entered. */
auto summarizer =
(cmdopt.keyFields.length == 0)
? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))(
? new NoKeySummarizer!(typeof(bufferedOutput))(
cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

: (cmdopt.keyFields.length == 1)
? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))(
? new OneKeySummarizer!(typeof(bufferedOutput))(
cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

: new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))(
: new MultiKeySummarizer!(typeof(bufferedOutput))(
cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

/* Add the operators to the Summarizer. */
summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

/* If there's no input header line, but writing an output header anyway, then
* write it now. This helps tasks further on in a unix pipeline detect errors
* quickly, without waiting for all the data to flow through the pipeline.
*/
auto printOptions = SummarizerPrintOptions(
cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

if (!cmdopt.hasHeader && cmdopt.writeHeader)
{
summarizer.writeSummaryHeader(bufferedOutput, printOptions);
bufferedOutput.flush;
}

/* Process each input file, one line at a time. */
auto lineFields = new char[][](cmdopt.endFieldIndex);
bool headerFound = false;
Expand Down Expand Up @@ -567,6 +615,15 @@ void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
summarizer.processHeaderLine(lineFields);
headerFound = true;

/* Write the header now. This helps tasks further on in a unix
* pipeline detect errors quickly, without waiting for all the
* data to flow through the pipeline. Note that an upstream task
* may have flushed its header line, so the header may arrive
* long before the main block of data.
*/
summarizer.writeSummaryHeader(bufferedOutput, printOptions);
bufferedOutput.flush;
}
}
else
Expand All @@ -589,16 +646,8 @@ void tsvSummarize(ref TsvSummarizeOptions cmdopt)
debug writeln("[tsvSummarize] After reading all data.");

/* Whew! We're done processing input data. Run the calculations and print. */
auto printOptions = SummarizerPrintOptions(
cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
auto stdoutWriter = stdout.lockingTextWriter;

if (cmdopt.hasHeader || cmdopt.writeHeader)
{
summarizer.writeSummaryHeader(stdoutWriter, printOptions);
}

summarizer.writeSummaryBody(stdoutWriter, printOptions);
summarizer.writeSummaryBody(bufferedOutput, printOptions);
}

/** The default field header. This is used when the input doesn't have field headers,
Expand Down
Loading