Experimental named field support in tsv-join #288

Merged: 5 commits, Jun 8, 2020
12 changes: 8 additions & 4 deletions common/src/tsv_utils/common/fieldlist.d
@@ -241,8 +241,12 @@ alias ConsumeEntireFieldListString = Flag!"consumeEntireFieldListString";

The optional `cmdOptionString` and `headerCmdArg` arguments are used to generate better
error messages. `cmdOptionString` should be the command line arguments string passed to
`std.getopt`. e.g `"f|field"`. The `headerCmdArg` argument should be the option for
turning on header line processing. Most tsv-utils tools can use the default value.
`std.getopt`. e.g `"f|field"`. This is added to the error message. Callers already
adding the option name to the error message should pass the empty string.

The `headerCmdArg` argument should be the option for turning on header line processing.
This is standard for tsv-utils tools (`--H|header`), so most tsv-utils tools will use
the default value.

`parseFieldList` returns a reference range. This is so the `consumed` member function
remains valid when using the range with facilities that would copy a value-based
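For orientation, a minimal sketch (not part of the patch) of how a caller might invoke `parseFieldList` with these arguments, mirroring the call pattern added to tsv-join.d below. The header fields and the `"f|fields"` option string are invented, and `headerCmdArg` is left at its default:

```d
import std.array : array;
import std.typecons : No, Yes;
import tsv_utils.common.fieldlist : parseFieldList;

void example()
{
    /* Hypothetical header line fields from a file read with header processing on. */
    string[] headerFields = ["date", "time", "value"];
    bool hasHeader = true;

    /* "time,value" resolves the names against headerFields, yielding [2, 3]
     * (one-based, because No.convertToZeroBasedIndex is requested).
     */
    size_t[] fields =
        "time,value"
        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
            (hasHeader, headerFields, "f|fields")
        .array;
}
```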
@@ -1390,7 +1394,7 @@ private auto namedFieldGroupToRegex(const char[] fieldGroup)
if (g[0] == HYPHEN)
{
enforce(!hyphenSeparatorFound && regexString.data.length != 0,
format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.\n",
format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.",
fieldGroup));

assert(field1Regex.empty);
@@ -1418,7 +1422,7 @@ private auto namedFieldGroupToRegex(const char[] fieldGroup)
}
}
enforce(!hyphenSeparatorFound || regexString.data.length != 0,
format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.\n",
format("Hyphens in field names must be backslash escaped unless separating two field names: '%s'.",
fieldGroup));

if (!hyphenSeparatorFound)
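A reader's note on the rule these messages enforce, as a sketch under the assumption (from the tsv-utils named-field syntax) that an unescaped hyphen between two field names selects a range, while a backslash-escaped hyphen is a literal character in a name:

```d
import std.array : array;
import std.typecons : No, Yes;
import tsv_utils.common.fieldlist : parseFieldList;

void hyphenExample()
{
    string[] headerFields = ["date", "time", "value"];

    /* An unescaped hyphen separates two field names, selecting the range
     * 'date' through 'time': fields [1, 2].
     */
    auto fields = "date-time"
        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
            (true, headerFields, "f|fields")
        .array;

    /* A hyphen with no field name on its left (e.g. "-time") or on its right
     * triggers the errors above. A field whose name itself contains a hyphen
     * would be written with the hyphen escaped, e.g. `max\-value`.
     */
}
```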
99 changes: 85 additions & 14 deletions tsv-join/src/tsv_utils/tsv-join.d
@@ -16,6 +16,7 @@ module tsv_utils.tsv_join;
import std.exception : enforce;
import std.stdio;
import std.format : format;
import std.range;
import std.typecons : tuple;

auto helpText = q"EOS
@@ -70,12 +71,12 @@ struct TsvJoinOptions
/* Data available to the main program. Variables used only for command line argument
* processing are local to processArgs.
*/
string programName;
InputSourceRange inputSources; // Input Files
ByLineSourceRange!() filterSource; // Derived: --filter
size_t[] keyFields; // --key-fields
size_t[] dataFields; // --data-fields
size_t[] appendFields; // --append-fields
string programName; /// Program name
InputSourceRange inputSources; /// Input Files
ByLineSourceRange!() filterSource; /// Derived: --filter
size_t[] keyFields; /// Derived: --key-fields
size_t[] dataFields; /// Derived: --data-fields
size_t[] appendFields; /// Derived: --append-fields
bool hasHeader = false; // --H|header
string appendHeaderPrefix = ""; // --append-header-prefix
bool writeAll = false; // --write-all
@@ -97,14 +98,23 @@ struct TsvJoinOptions
*/
auto processArgs (ref string[] cmdArgs)
{
import std.array : split;
import std.conv : to;
import std.getopt;
import std.path : baseName, stripExtension;
import std.typecons : Yes, No;
import tsv_utils.common.fieldlist : makeFieldListOptionHandler;
import tsv_utils.common.fieldlist;

string filterFile; // --filter
bool helpVerbose = false; // --help-verbose
bool versionWanted = false; // --V|version
string filterFile; // --filter
string keyFieldsArg; // --key-fields
string dataFieldsArg; // --data-fields
string appendFieldsArg; // --append-fields

string keyFieldsOptionString = "k|key-fields";
string dataFieldsOptionString = "d|data-fields";
string appendFieldsOptionString = "a|append-fields";

programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

@@ -124,14 +134,17 @@
"help-verbose", " Print full help.", &helpVerbose,
"f|filter-file", "FILE (Required) File with records to use as a filter.", &filterFile,

"k|key-fields", "<field-list> Fields to use as join key. Default: 0 (entire line).",
keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
keyFieldsOptionString,
"<field-list> Fields to use as join key. Default: 0 (entire line).",
&keyFieldsArg,

"d|data-fields", "<field-list> Data record fields to use as join key, if different than --key-fields.",
dataFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
dataFieldsOptionString,
"<field-list> Data record fields to use as join key, if different than --key-fields.",
&dataFieldsArg,

"a|append-fields", "<field-list> Filter fields to append to matched records.",
appendFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),
appendFieldsOptionString,
"<field-list> Filter fields to append to matched records.",
&appendFieldsArg,

std.getopt.config.caseSensitive,
"H|header", " Treat the first line of each file as a header.", &hasHeader,
@@ -181,6 +194,64 @@ struct TsvJoinOptions
ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
inputSources = inputSourceRange(filepaths, readHeader);

/* Field-list args (--k|key-fields, --d|data-fields, --a|append-fields) are
* parsed after header lines from the filter file and first data file have
* been read. The files were opened in the previous step, when setting up
* the 'filterSource' and 'inputSources' ranges.
*
* The field-list parsing step translates any named fields to one-based
* field numbers. Note that a named field may have different field
* numbers in the filter file and data files.
*
* The 'derivations()' method works off the one-based indices, converting
* them to zero-based. It also handles the full-line cases.
*/

string[] filterFileHeaderFields;
string[] inputSourceHeaderFields;

if (hasHeader && !filterSource.front.byLine.empty)
{
filterFileHeaderFields = filterSource.front.byLine.front.split(delim).to!(string[]);
}

if (hasHeader) inputSourceHeaderFields = inputSources.front.header.split(delim).to!(string[]);

if (!keyFieldsArg.empty)
{
keyFields =
keyFieldsArg
.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
(hasHeader, filterFileHeaderFields, keyFieldsOptionString)
.array;
}

if (!dataFieldsArg.empty)
{
dataFields =
dataFieldsArg
.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
(hasHeader, inputSourceHeaderFields, dataFieldsOptionString)
.array;
}
else if (!keyFieldsArg.empty)
{
dataFields =
keyFieldsArg
.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
(hasHeader, inputSourceHeaderFields, dataFieldsOptionString)
.array;
}

if (!appendFieldsArg.empty)
{
appendFields =
appendFieldsArg
.parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
(hasHeader, filterFileHeaderFields, appendFieldsOptionString)
.array;
}

consistencyValidations(cmdArgs);
derivations();
}
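To make the comment's point about differing field numbers concrete, a small sketch (the headers are invented, not from the patch) showing the same field name resolving to different one-based numbers against the filter-file header and the data-file header:

```d
import std.array : array;
import std.typecons : No, Yes;
import tsv_utils.common.fieldlist : parseFieldList;

void resolutionExample()
{
    /* Hypothetical headers: 'id' is field 2 in the filter file but field 1
     * in the data files.
     */
    string[] filterFileHeaderFields = ["date", "id", "score"];
    string[] inputSourceHeaderFields = ["id", "value"];

    auto keyFields = "id"
        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
            (true, filterFileHeaderFields, "k|key-fields")
        .array;                                              // [2]

    auto dataFields = "id"
        .parseFieldList!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero)
            (true, inputSourceHeaderFields, "d|data-fields")
        .array;                                              // [1]
}
```

This is why the key-fields list is resolved against the filter file's header, while the data-fields list (or the key-fields list reused for the data files) is resolved against the first input file's header.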