Expanded CSV documentation.

This commit is contained in:
Jesse Phillips 2011-11-05 13:37:13 -07:00
parent 5fddab421f
commit f6f5ce9d89

208
std/csv.d
View file

@ -8,7 +8,7 @@
* tabular data. It has been common for programs to use their own * tabular data. It has been common for programs to use their own
* variant of the CSV format. This parser will loosely follow the * variant of the CSV format. This parser will loosely follow the
* $(WEB tools.ietf.org/html/rfc4180, RFC-4180). CSV input should adhere * $(WEB tools.ietf.org/html/rfc4180, RFC-4180). CSV input should adhere
* to the following rules. * to the following criteria, differences from RFC-4180 in parentheses.
* *
* $(UL * $(UL
* $(LI A record is separated by a new line (CRLF,LF,CR)) * $(LI A record is separated by a new line (CRLF,LF,CR))
@ -18,7 +18,7 @@
* $(LI A field containing new lines, commas, or double quotes * $(LI A field containing new lines, commas, or double quotes
* should be enclosed in double quotes (customizable)) * should be enclosed in double quotes (customizable))
* $(LI Double quotes in a field are escaped with a double quote) * $(LI Double quotes in a field are escaped with a double quote)
* $(LI Each record should contain the same number of fields) * $(LI Each record should contain the same number of fields (not enforced))
* ) * )
* *
* Example: * Example:
@ -30,9 +30,9 @@
* *
* void main() * void main()
* { * {
* auto text = "Joe,Carpenter,300000\nFred,Fly,4\r\n"; * auto text = "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
* *
* foreach(record; csvText!(Tuple!(string,string,int))(text)) * foreach(record; csvReader!(Tuple!(string,string,int))(text))
* { * {
* writefln("%s works as a %s and earns $%d per year", * writefln("%s works as a %s and earns $%d per year",
* record[0], record[1], record[2]); * record[0], record[1], record[2]);
@ -122,18 +122,22 @@ enum Malformed
} }
/** /**
* Builds a RecordList range for iterating over records found in input. * Builds a $(LREF Records) struct for iterating over records found in $(D
* input).
* *
* This function simplifies the process for standard text input. * This function simplifies the process for standard text input.
* For other input create RecordList yourself. * For other input, for example data delimited by colons, create Records yourself.
* *
* The Contents of the input can be provided if all the records are the same * The $(D ErrorLevel) can be set to $(LREF Malformed).ignore if best guess
* type such as all integer data: * processing should take place.
*
* The $(D Contents) of the input can be provided if all the records are the
* same type such as all integer data:
* *
* ------- * -------
* string str = `76,26,22`; * string str = `76,26,22`;
* int[] ans = [76,26,22]; * int[] ans = [76,26,22];
* auto records = csvText!int(str); * auto records = csvReader!int(str);
* *
* int count; * int count;
* foreach(record; records) { * foreach(record; records) {
@ -141,15 +145,6 @@ enum Malformed
* } * }
* ------- * -------
* *
* The ErrorLevel can be set to $(LREF Malformed).ignore if best guess
* processing should take place.
*
* An optional heading can be provided. The first record will be read in as the
* heading. If the Content type is a struct then the heading provided is
* expected to correspond to the fields in the struct. When Content is
* non-struct the heading must be provided in the same order as the input or an
* exception is thrown.
*
* Example using a struct: * Example using a struct:
* *
* ------- * -------
@ -160,7 +155,7 @@ enum Malformed
* double other; * double other;
* } * }
* *
* auto records = csvText!Layout(str); * auto records = csvReader!Layout(str);
* *
* foreach(record; records) { * foreach(record; records) {
* writeln(record.name); * writeln(record.name);
@ -169,27 +164,46 @@ enum Malformed
* } * }
* ------- * -------
* *
* The header can be provided to identify which columns to read in. * An optional $(D heading) can be provided. The first record will be read in
* as the heading. If $(D Contents) is a struct then the heading provided is
* expected to correspond to the fields in the struct. When $(D Contents) is
* non-struct the $(D heading) must be provided in the same order as the input
* or an exception is thrown.
*
* Read only column "b":
* *
* ------- * -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562"; * string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* auto records = csvText(str, ["b"]); * auto records = csvReader(str, ["b"]);
* *
* auto ans = ["65","123"]; * auto ans = [["65"],["123"]];
* foreach(record; records) * foreach(record; records) {
* foreach(cell; record) { * assert(equal(record, ans.front));
* assert(cell == ans.front);
* ans.popFront(); * ans.popFront();
* } * }
* ------- * -------
* *
* Read from heading of different order:
*
* -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* struct Layout
* {
* int value;
* double other;
* string name;
* }
*
* auto records = csvReader!Layout(str, ["b","c","a"]);
* -------
*
* The header can also be left empty if the input contains a header but * The header can also be left empty if the input contains a header but
* all columns should be iterated. The heading from the input can always * all columns should be iterated. The heading from the input can always
* be accessed from the heading field. * be accessed from the heading field.
* *
* ------- * -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562"; * string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* auto records = csvText(str, cast(string[])null); * auto records = csvReader(str, cast(string[])null);
* *
* assert(records.heading == ["a","b","c"]); * assert(records.heading == ["a","b","c"]);
* ------- * -------
@ -198,11 +212,8 @@ enum Malformed
* nulls) prevents just sending null or [] as a header. * nulls) prevents just sending null or [] as a header.
* *
* Returns: * Returns:
* If Contents is a struct, the range will return the * $(LREF Records) struct which provides a $(XREF range, InputRange) of
* struct populated by a single record. * each record.
*
* Otherwise the range will return a $(LREF Record) range of the type
* (default string).
* *
* Throws: * Throws:
* IncompleteCellException When a quote is found in an unquoted field, * IncompleteCellException When a quote is found in an unquoted field,
@ -213,22 +224,23 @@ enum Malformed
* column is not found or the order did not match that found in the input * column is not found or the order did not match that found in the input
* (non-struct). * (non-struct).
*/ */
auto csvText(Contents = string, Malformed ErrorLevel auto csvReader(Contents = string, Malformed ErrorLevel
= Malformed.throwException, Range)(Range input) = Malformed.throwException, Range)(Range input)
if(isInputRange!Range && isSomeChar!(ElementType!Range) if(isInputRange!Range && isSomeChar!(ElementType!Range)
&& !is(Contents == class)) && !is(Contents == class))
{ {
return RecordList!(Contents,ErrorLevel,Range,ElementType!Range) return Records!(Contents,ErrorLevel,Range,ElementType!Range,string[])
(input, ',', '"'); (input, ',', '"');
} }
/// Ditto /// Ditto
auto csvText(Contents = string, Malformed ErrorLevel auto csvReader(Contents = string, Malformed ErrorLevel
= Malformed.throwException, Range)(Range input, string[] heading) = Malformed.throwException, Range, Heading)
(Range input, Heading heading)
if(isInputRange!Range && isSomeChar!(ElementType!Range) if(isInputRange!Range && isSomeChar!(ElementType!Range)
&& !is(Contents == class)) && !is(Contents == class) && isInputRange!Heading)
{ {
return RecordList!(Contents,ErrorLevel,Range,ElementType!Range) return Records!(Contents,ErrorLevel,Range,ElementType!Range,Heading)
(input, ',', '"', heading); (input, ',', '"', heading);
} }
@ -236,7 +248,7 @@ auto csvText(Contents = string, Malformed ErrorLevel
unittest unittest
{ {
string str = `one,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix"; string str = `one,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix";
auto records = csvText(str); auto records = csvReader(str);
int count; int count;
foreach(record; records) foreach(record; records)
@ -253,7 +265,7 @@ unittest
unittest unittest
{ {
string str = "one,two\nthree,four\n"; string str = "one,two\nthree,four\n";
auto records = csvText(str); auto records = csvReader(str);
records.popFront(); records.popFront();
records.popFront(); records.popFront();
assert(records.empty); assert(records.empty);
@ -277,7 +289,7 @@ unittest {
ans[1].value = 65; ans[1].value = 65;
ans[1].other = 663.63; ans[1].other = 663.63;
auto records = csvText!Layout(str); auto records = csvReader!Layout(str);
int count; int count;
foreach(record; records) foreach(record; records)
@ -295,7 +307,7 @@ unittest
{ {
string str = `76,26,22`; string str = `76,26,22`;
int[] ans = [76,26,22]; int[] ans = [76,26,22];
auto records = csvText!int(str); auto records = csvReader!int(str);
foreach(record; records) foreach(record; records)
{ {
@ -314,7 +326,7 @@ unittest
string name; string name;
} }
auto records = csvText!Layout(str, ["b","c","a"]); auto records = csvReader!Layout(str, ["b","c","a"]);
Layout ans[2]; Layout ans[2];
ans[0].name = "Hello"; ans[0].name = "Hello";
@ -340,42 +352,37 @@ unittest
unittest unittest
{ {
string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562"; string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
auto records = csvText(str, ["b"]); auto records = csvReader(str, ["b"]);
auto ans = ["65","123"]; auto ans = [["65"],["123"]];
foreach(record; records) foreach(record; records) {
foreach(cell; record) { assert(equal(record, ans.front));
assert(cell == ans.front);
ans.popFront(); ans.popFront();
} }
try try
{ {
records = csvText(str, ["b","a"]); records = csvReader(str, ["b","a"]);
assert(0); assert(0);
} }
catch(Exception e) catch(Exception e)
{ {
} }
auto records2 = csvText!(string, Malformed.ignore)(str, ["b","a"]); auto records2 = csvReader!(string, Malformed.ignore)(str, ["b","a"]);
ans = ["Hello","65","World","123"]; ans = [["Hello","65"],["World","123"]];
foreach(record; records2) foreach(record; records2) {
foreach(cell; record) assert(equal(record, ans.front));
{
assert(cell == ans.front);
ans.popFront(); ans.popFront();
} }
str = "a,c,e\nJoe,Carpenter,300000\nFred,Fly,4"; str = "a,c,e\nJoe,Carpenter,300000\nFred,Fly,4";
records2 = csvText!(string, Malformed.ignore)(str, ["a","b","c","d"]); records2 = csvReader!(string, Malformed.ignore)(str, ["a","b","c","d"]);
ans = ["Joe","Carpenter","Fred","Fly"]; ans = [["Joe","Carpenter"],["Fred","Fly"]];
foreach(record; records2) foreach(record; records2) {
foreach(cell; record) assert(equal(record, ans.front));
{
assert(cell == ans.front);
ans.popFront(); ans.popFront();
} }
} }
@ -384,7 +391,7 @@ unittest
unittest unittest
{ {
string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562"; string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
auto records = csvText(str, cast(string[])null); auto records = csvReader(str, ["a"]);
assert(records.heading == ["a","b","c"]); assert(records.heading == ["a","b","c"]);
} }
@ -393,7 +400,7 @@ unittest
unittest unittest
{ {
string str = "one \"quoted\""; string str = "one \"quoted\"";
foreach(record; csvText!(string, Malformed.ignore)(str)) foreach(record; csvReader!(string, Malformed.ignore)(str))
{ {
foreach(cell; record) foreach(cell; record)
{ {
@ -406,7 +413,7 @@ unittest
{ {
string a,b; string a,b;
} }
foreach(record; csvText!(Ans, Malformed.ignore)(str)) foreach(record; csvReader!(Ans, Malformed.ignore)(str))
{ {
assert(record.a == "one \"quoted\""); assert(record.a == "one \"quoted\"");
assert(record.b == "two \"quoted\" end"); assert(record.b == "two \"quoted\" end");
@ -418,7 +425,7 @@ unittest
{ {
string str = "one,two\r\nthree"; string str = "one,two\r\nthree";
auto records = csvText(str); auto records = csvReader(str);
auto record = records.front; auto record = records.front;
assert(record.front == "one"); assert(record.front == "one");
record.popFront(); record.popFront();
@ -431,7 +438,7 @@ unittest
/** /**
* Range for iterating CSV records. * Range for iterating CSV records.
* *
* This range is returned by the csvText functions. It can be * This range is returned by the csvReader functions. It can be
* created in a similar manner to allow for custom separation. * created in a similar manner to allow for custom separation.
* *
* Example for integer data: * Example for integer data:
@ -439,7 +446,7 @@ unittest
* ------- * -------
* string str = `76;^26^;22`; * string str = `76;^26^;22`;
* int[] ans = [76,26,22]; * int[] ans = [76,26,22];
* auto records = RecordList!(int,Malformed.ignore,string,char) * auto records = Records!(int,Malformed.ignore,string,char,string[])
* (str, ';', '^'); * (str, ';', '^');
* *
* foreach(record; records) { * foreach(record; records) {
@ -448,9 +455,10 @@ unittest
* ------- * -------
* *
*/ */
struct RecordList(Contents, Malformed ErrorLevel, Range, Separator) struct Records(Contents, Malformed ErrorLevel, Range, Separator, Heading)
if(isSomeChar!Separator && isInputRange!Range if(isSomeChar!Separator && isInputRange!Range
&& isSomeChar!(ElementType!Range) && !is(Contents == class)) && isSomeChar!(ElementType!Range) && !is(Contents == class)
&& isInputRange!Heading)
{ {
private: private:
Range _input; Range _input;
@ -466,12 +474,32 @@ private:
else else
Record!(Contents, ErrorLevel, Range, Separator) recordRange; Record!(Contents, ErrorLevel, Range, Separator) recordRange;
public: public:
/// Heading from the input in array form. /**
* Heading from the input in array form.
*
* -------
* string str = "a,b,c\nHello,65,63.63";
* auto records = csvReader(str, ["a"]);
*
* assert(records.heading == ["a","b","c"]);
* -------
*/
Range[] heading; Range[] heading;
/** /**
* Constructor to initialize the input, delimiter and quote for input * Constructor to initialize the input, delimiter and quote for input
* without a heading. * without a heading.
*
* -------
* string str = `76;^26^;22`;
* int[] ans = [76,26,22];
* auto records = Records!(int,Malformed.ignore,string,char,string[])
* (str, ';', '^');
*
* foreach(record; records) {
* assert(equal(record, ans));
* }
* -------
*/ */
this(Range input, Separator delimiter, Separator quote) this(Range input, Separator delimiter, Separator quote)
{ {
@ -492,19 +520,30 @@ public:
* Constructor to initialize the input, delimiter and quote for input * Constructor to initialize the input, delimiter and quote for input
* with a heading. * with a heading.
* *
* -------
* string str = "high;mean;low\n76;^26^;22";
* int[] ans = [76,22];
* auto records = Records!(int,Malformed.ignore,string,char,string[])
* (str, ';', '^',["high","low"]);
*
* foreach(record; records) {
* assert(equal(record, ans));
* }
* -------
*
* Throws: * Throws:
* HeadingMismatchException when a heading is provided but a * HeadingMismatchException when a heading is provided but a
* matching column is not found or the order did not match that found * matching column is not found or the order did not match that found
* in the input (non-struct). * in the input (non-struct).
*/ */
this(Range input, Separator delimiter, Separator quote, string[] colHeaders) this(Range input, Separator delimiter, Separator quote, Heading colHeaders)
{ {
_input = input; _input = input;
_separator = delimiter; _separator = delimiter;
_quote = quote; _quote = quote;
size_t[string] colToIndex; size_t[string] colToIndex;
foreach(i, h; colHeaders) foreach(h; colHeaders)
{ {
colToIndex[h] = size_t.max; colToIndex[h] = size_t.max;
} }
@ -522,14 +561,15 @@ public:
colIndex++; colIndex++;
} }
indices.length = colHeaders.length; indices.length = colToIndex.length;
foreach(i, h; colHeaders) int i;
foreach(h; colHeaders)
{ {
immutable index = colToIndex[h]; immutable index = colToIndex[h];
static if(ErrorLevel != Malformed.ignore) static if(ErrorLevel != Malformed.ignore)
enforceEx!(HeadingMismatchException)(index < size_t.max, enforceEx!(HeadingMismatchException)(index < size_t.max,
"Header not found: " ~ to!string(h)); "Header not found: " ~ to!string(h));
indices[i] = index; indices[i++] = index;
} }
static if(!is(Contents == struct)) static if(!is(Contents == struct))
@ -554,6 +594,13 @@ public:
} }
/** /**
* Part of the $(XREF range, InputRange) interface.
*
* Returns:
* If $(D Contents) is a struct, the struct will be filled with record
* data.
*
* If $(D Contents) is non-struct, a $(LREF Record) will be returned.
*/ */
@property auto front() @property auto front()
{ {
@ -570,6 +617,7 @@ public:
} }
/** /**
* Part of the $(XREF range, InputRange) interface.
*/ */
@property bool empty() @property bool empty()
{ {
@ -577,7 +625,7 @@ public:
} }
/** /**
* Brings the next Record into the front of the range. * Part of the $(XREF range, InputRange) interface.
* *
* Throws: * Throws:
* IncompleteCellException When a quote is found in an unquoted field, * IncompleteCellException When a quote is found in an unquoted field,
@ -661,7 +709,7 @@ public:
unittest { unittest {
string str = `76;^26^;22`; string str = `76;^26^;22`;
int[] ans = [76,26,22]; int[] ans = [76,26,22];
auto records = RecordList!(int,Malformed.ignore,string,char) auto records = Records!(int,Malformed.ignore,string,char,string[])
(str, ';', '^'); (str, ';', '^');
foreach(record; records) foreach(record; records)
@ -671,7 +719,7 @@ unittest {
} }
/** /**
* Returned by a RecordList when Contents is a non-struct. * Returned by a Records when Contents is a non-struct.
*/ */
private struct Record(Contents, Malformed ErrorLevel, Range, Separator) private struct Record(Contents, Malformed ErrorLevel, Range, Separator)
if(!is(Contents == class) && !is(Contents == struct)) if(!is(Contents == class) && !is(Contents == struct))
@ -685,7 +733,7 @@ private:
bool _empty; bool _empty;
size_t[] _popCount; size_t[] _popCount;
public: public:
/** /*
* params: * params:
* input = Pointer to a character input range * input = Pointer to a character input range
* delimiter = Separator for each column * delimiter = Separator for each column
@ -722,6 +770,7 @@ public:
} }
/** /**
* Part of the $(XREF range, InputRange) interface.
*/ */
@property Contents front() @property Contents front()
{ {
@ -730,6 +779,7 @@ public:
} }
/** /**
* Part of the $(XREF range, InputRange) interface.
*/ */
@property bool empty() @property bool empty()
{ {
@ -753,7 +803,7 @@ public:
/** /**
* Brings the next Content into the front of the range. * Part of the $(XREF range, InputRange) interface.
* *
* Throws: * Throws:
* IncompleteCellException When a quote is found in an unquoted field, * IncompleteCellException When a quote is found in an unquoted field,