std.csv allow un equal number of value separators

By default `std.csv` will throw if there is an number of separators on a line
unequal to the number of separators of the first line.
To allow, or disallow, unequal numbers of separators a `bool` can be passed to
all overloads of the `csvReader` function as shown below.

```
string text = "76,26,22\n1,2\n3,4,5,6";
auto records = text.csvReader!int(',', '"', true);

assert(records.equal!equal([
    [76, 26, 22],
    [1, 2],
    [3, 4, 5, 6]
]));
```

working in the comments from the PR

more review changes

one more round of review fixes

there is always one left
This commit is contained in:
Robert burner Schadek 2021-08-26 13:56:41 +02:00 committed by The Dlang Bot
parent f9fc1f612a
commit ba80631325
2 changed files with 131 additions and 22 deletions

136
std/csv.d
View file

@ -57,7 +57,7 @@
* associative array. Passing null to signify that a header is present.
*
* -------
* auto text = "Name,Occupation,Salary\r"
* auto text = "Name,Occupation,Salary\r" ~
* "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
*
* foreach (record; csvReader!(string[string])
@ -338,14 +338,15 @@ Throws:
when the exception is thrown for different types of `Contents`.
*/
auto csvReader(Contents = string,Malformed ErrorLevel = Malformed.throwException, Range, Separator = char)(Range input,
Separator delimiter = ',', Separator quote = '"')
Separator delimiter = ',', Separator quote = '"',
bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
&& isSomeChar!(Separator)
&& !is(Contents T : T[U], U : string))
{
return CsvReader!(Contents,ErrorLevel,Range,
Unqual!(ElementType!Range),string[])
(input, delimiter, quote);
(input, delimiter, quote, allowInconsistentDelimiterCount);
}
/// ditto
@ -353,7 +354,8 @@ auto csvReader(Contents = string,
Malformed ErrorLevel = Malformed.throwException,
Range, Header, Separator = char)
(Range input, Header header,
Separator delimiter = ',', Separator quote = '"')
Separator delimiter = ',', Separator quote = '"',
bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
&& isSomeChar!(Separator)
&& isForwardRange!Header
@ -361,7 +363,7 @@ if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
{
return CsvReader!(Contents,ErrorLevel,Range,
Unqual!(ElementType!Range),Header)
(input, header, delimiter, quote);
(input, header, delimiter, quote, allowInconsistentDelimiterCount);
}
/// ditto
@ -369,14 +371,16 @@ auto csvReader(Contents = string,
Malformed ErrorLevel = Malformed.throwException,
Range, Header, Separator = char)
(Range input, Header header,
Separator delimiter = ',', Separator quote = '"')
Separator delimiter = ',', Separator quote = '"',
bool allowInconsistentDelimiterCount = false)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar)
&& isSomeChar!(Separator)
&& is(Header : typeof(null)))
{
return CsvReader!(Contents,ErrorLevel,Range,
Unqual!(ElementType!Range),string[])
(input, cast(string[]) null, delimiter, quote);
(input, cast(string[]) null, delimiter, quote,
allowInconsistentDelimiterCount);
}
@ -474,6 +478,66 @@ The header from the input can always be accessed from the `header` field.
assert(records.header == ["a","b","c"]);
}
/**
Handcrafted csv files tend to have an variable amount of columns.
By default `std.csv` will throw if the number of columns on a line
is unequal to the number of columns of the first line.
To allow, or disallow, a variable amount of columns a `bool` can be passed to
all overloads of the `csvReader` function as shown below.
*/
@safe unittest
{
import std.algorithm.comparison : equal;
string text = "76,26,22\n1,2\n3,4,5,6";
auto records = text.csvReader!int(',', '"', true);
assert(records.equal!equal([
[76, 26, 22],
[1, 2],
[3, 4, 5, 6]
]));
}
/// ditto
@safe unittest
{
import std.algorithm.comparison : equal;
static struct Three
{
int a;
int b;
int c;
}
string text = "76,26,22\n1,2\n3,4,5,6";
auto records = text.csvReader!Three(',', '"', true);
assert(records.equal([
Three(76, 26, 22),
Three(1, 2, 0),
Three(3, 4, 5)
]));
}
/// ditto
@safe unittest
{
import std.algorithm.comparison : equal;
auto text = "Name,Occupation,Salary\r" ~
"Joe,Carpenter,300000\nFred,Blacksmith\r\n";
auto r = csvReader!(string[string])(text, null, ',', '"', true);
assert(r.equal([
[ "Name" : "Joe", "Occupation" : "Carpenter", "Salary" : "300000" ],
[ "Name" : "Fred", "Occupation" : "Blacksmith" ]
]));
}
// Test standard iteration over input.
@safe pure unittest
{
@ -843,6 +907,7 @@ private:
Separator _quote;
size_t[] indices;
bool _empty;
bool _allowInconsistentDelimiterCount;
static if (is(Contents == struct) || is(Contents == class))
{
Contents recordContent;
@ -884,11 +949,13 @@ public:
* }
* -------
*/
this(Range input, Separator delimiter, Separator quote)
this(Range input, Separator delimiter, Separator quote,
bool allowInconsistentDelimiterCount)
{
_input = new Input!(Range, ErrorLevel)(input);
_separator = delimiter;
_quote = quote;
_allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;
if (_input.range.empty)
{
@ -920,11 +987,13 @@ public:
* matching column is not found or the order did not match that found
* in the input (non-struct).
*/
this(Range input, Header colHeaders, Separator delimiter, Separator quote)
this(Range input, Header colHeaders, Separator delimiter, Separator quote,
bool allowInconsistentDelimiterCount)
{
_input = new Input!(Range, ErrorLevel)(input);
_separator = delimiter;
_quote = quote;
_allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;
if (_input.range.empty)
{
@ -939,7 +1008,8 @@ public:
}
auto r = CsvRecord!(string, ErrorLevel, Range, Separator)
(_input, _separator, _quote, indices);
(_input, _separator, _quote, indices,
_allowInconsistentDelimiterCount);
size_t colIndex;
foreach (col; r)
@ -952,6 +1022,8 @@ public:
}
// The above loop empties the header row.
recordRange._empty = true;
recordRange._allowInconsistentDelimiterCount =
allowInconsistentDelimiterCount;
indices.length = colToIndex.length;
int i;
@ -1090,12 +1162,14 @@ public:
static if (is(Contents == struct) || is(Contents == class))
{
recordRange = typeof(recordRange)
(_input, _separator, _quote, null);
(_input, _separator, _quote, null,
_allowInconsistentDelimiterCount);
}
else
{
recordRange = typeof(recordRange)
(_input, _separator, _quote, indices);
(_input, _separator, _quote, indices,
_allowInconsistentDelimiterCount);
}
static if (is(Contents T : T[U], U : string))
@ -1168,7 +1242,7 @@ public:
string str = `76;^26^;22`;
int[] ans = [76,26,22];
auto records = CsvReader!(int,Malformed.ignore,string,char,string[])
(str, ';', '^');
(str, ';', '^', false);
foreach (record; records)
{
@ -1203,6 +1277,7 @@ private:
Contents curContentsoken;
typeof(appender!(dchar[])()) _front;
bool _empty;
bool _allowInconsistentDelimiterCount;
size_t[] _popCount;
public:
/*
@ -1214,7 +1289,8 @@ public:
* If empty, all columns are returned. List must be in order.
*/
this(Input!(Range, ErrorLevel)* input, Separator delimiter,
Separator quote, size_t[] indices)
Separator quote, size_t[] indices,
bool allowInconsistentDelimiterCount)
{
_input = input;
_separator = delimiter;
@ -1222,6 +1298,7 @@ public:
_front = appender!(dchar[])();
_popCount = indices.dup;
_allowInconsistentDelimiterCount = allowInconsistentDelimiterCount;
// If a header was given, each call to popFront will need
// to eliminate so many tokens. This calculates
@ -1304,23 +1381,38 @@ public:
{
_empty = true;
static if (ErrorLevel == Malformed.throwException)
if (_input.rowLength != 0)
if (_input.col != _input.rowLength)
throw new CSVException(
format("Row %s's length %s does not match "~
"previous length of %s.", _input.row,
_input.col, _input.rowLength));
{
if (_input.rowLength != 0 && _input.col != _input.rowLength
&& !_allowInconsistentDelimiterCount)
{
throw new CSVException(
format("Row %s's length %s does not match "~
"previous length of %s.", _input.row,
_input.col, _input.rowLength));
}
}
return;
}
else
{
static if (ErrorLevel == Malformed.throwException)
if (_input.rowLength != 0)
if (_input.col > _input.rowLength)
{
if (_input.rowLength != 0 && _input.col > _input.rowLength)
{
if (!_allowInconsistentDelimiterCount)
{
throw new CSVException(
format("Row %s's length %s does not match "~
"previous length of %s.", _input.row,
_input.col, _input.rowLength));
}
else
{
_empty = true;
return;
}
}
}
}
// Separator is left on the end of input from the last call.