Expended CSV documentation.

This commit is contained in:
Jesse Phillips 2011-11-05 13:37:13 -07:00
parent 5fddab421f
commit f6f5ce9d89

224
std/csv.d
View file

@ -8,7 +8,7 @@
* tabular data. It has been common for programs to use their own
* variant of the CSV format. This parser will loosely follow the
* $(WEB tools.ietf.org/html/rfc4180, RFC-4180). CSV input should adhered
* to the following rules.
* to the following criteria, differences from RFC-4180 in parentheses.
*
* $(UL
* $(LI A record is separated by a new line (CRLF,LF,CR))
@ -18,7 +18,7 @@
* $(LI A field containing new lines, commas, or double quotes
* should be enclosed in double quotes (customizable))
* $(LI Double quotes in a field are escaped with a double quote)
* $(LI Each record should contain the same number of fields)
* $(LI Each record should contain the same number of fields (not enforced))
* )
*
* Example:
@ -30,9 +30,9 @@
*
* void main()
* {
* auto text = "Joe,Carpenter,300000\nFred,Fly,4\r\n";
* auto text = "Joe,Carpenter,300000\nFred,Blacksmith,400000\r\n";
*
* foreach(record; csvText!(Tuple!(string,string,int))(text))
* foreach(record; csvReader!(Tuple!(string,string,int))(text))
* {
* writefln("%s works as a %s and earns $%d per year",
* record[0], record[1], record[2]);
@ -122,18 +122,22 @@ enum Malformed
}
/**
* Builds a RecordList range for iterating over records found in input.
* Builds a $(LREF Records) struct for iterating over records found in $(D
* input).
*
* This function simplifies the process for standard text input.
* For other input create RecordList yourself.
* For other input, delimited by colon, create Records yourself.
*
* The Contents of the input can be provided if all the records are the same
* type such as all integer data:
* The $(D ErrorLevel) can be set to $(LREF Malformed).ignore if best guess
* processing should take place.
*
* The $(D Contents) of the input can be provided if all the records are the
* same type such as all integer data:
*
* -------
* string str = `76,26,22`;
* int[] ans = [76,26,22];
* auto records = csvText!int(str);
* auto records = csvReader!int(str);
*
* int count;
* foreach(record; records) {
@ -141,15 +145,6 @@ enum Malformed
* }
* -------
*
* The ErrorLevel can be set to $(LREF Malformed).ignore if best guess
* processing should take place.
*
* An optional heading can be provided. The first record will be read in as the
* heading. If the Content type is a struct then the heading provided is
* expected to correspond to the fields in the struct. When Content is
* non-struct the heading must be provided in the same order as the input or an
* exception is thrown.
*
* Example using a struct:
*
* -------
@ -160,7 +155,7 @@ enum Malformed
* double other;
* }
*
* auto records = csvText!Layout(str);
* auto records = csvReader!Layout(str);
*
* foreach(record; records) {
* writeln(record.name);
@ -169,18 +164,37 @@ enum Malformed
* }
* -------
*
* The header can be provided to identify which columns to read in.
* An optional $(D heading) can be provided. The first record will be read in
* as the heading. If $(D Contents) is a struct then the heading provided is
* expected to correspond to the fields in the struct. When $(D Contents) is
* non-struct the $(D heading) must be provided in the same order as the input
* or an exception is thrown.
*
* Read only column "b":
*
* -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* auto records = csvText(str, ["b"]);
* auto records = csvReader(str, ["b"]);
*
* auto ans = ["65","123"];
* foreach(record; records)
* foreach(cell; record) {
* assert(cell == ans.front);
* ans.popFront();
* }
* auto ans = [["65"],["123"]];
* foreach(record; records) {
* assert(equal(record, ans.front));
* ans.popFront();
* }
* -------
*
* Read from heading of different order:
*
* -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* struct Layout
* {
* int value;
* double other;
* string name;
* }
*
* auto records = csvReader!Layout(str, ["b","c","a"]);
* -------
*
* The header can also be left empty if the input contains a header but
@ -189,7 +203,7 @@ enum Malformed
*
* -------
* string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
* auto records = csvText(str, cast(string[])null);
* auto records = csvReader(str, cast(string[])null);
*
* assert(records.heading == ["a","b","c"]);
* -------
@ -198,11 +212,8 @@ enum Malformed
* nulls) prevents just sending null or [] as a header.
*
* Returns:
* If Contents is a struct, the range will return the
* struct populated by a single record.
*
* Otherwise the range will return a $(LREF Record) range of the type
* (default string).
* $(LREF Records) struct which provides a $(XREF range, InputRange) of
* each record.
*
* Throws:
* IncompleteCellException When a quote is found in an unquoted field,
@ -213,22 +224,23 @@ enum Malformed
* column is not found or the order did not match that found in the input
* (non-struct).
*/
auto csvText(Contents = string, Malformed ErrorLevel
auto csvReader(Contents = string, Malformed ErrorLevel
= Malformed.throwException, Range)(Range input)
if(isInputRange!Range && isSomeChar!(ElementType!Range)
&& !is(Contents == class))
{
return RecordList!(Contents,ErrorLevel,Range,ElementType!Range)
return Records!(Contents,ErrorLevel,Range,ElementType!Range,string[])
(input, ',', '"');
}
/// Ditto
auto csvText(Contents = string, Malformed ErrorLevel
= Malformed.throwException, Range)(Range input, string[] heading)
auto csvReader(Contents = string, Malformed ErrorLevel
= Malformed.throwException, Range, Heading)
(Range input, Heading heading)
if(isInputRange!Range && isSomeChar!(ElementType!Range)
&& !is(Contents == class))
&& !is(Contents == class) && isInputRange!Heading)
{
return RecordList!(Contents,ErrorLevel,Range,ElementType!Range)
return Records!(Contents,ErrorLevel,Range,ElementType!Range,Heading)
(input, ',', '"', heading);
}
@ -236,7 +248,7 @@ auto csvText(Contents = string, Malformed ErrorLevel
unittest
{
string str = `one,two,"three ""quoted""","",` ~ "\"five\nnew line\"\nsix";
auto records = csvText(str);
auto records = csvReader(str);
int count;
foreach(record; records)
@ -253,7 +265,7 @@ unittest
unittest
{
string str = "one,two\nthree,four\n";
auto records = csvText(str);
auto records = csvReader(str);
records.popFront();
records.popFront();
assert(records.empty);
@ -277,7 +289,7 @@ unittest {
ans[1].value = 65;
ans[1].other = 663.63;
auto records = csvText!Layout(str);
auto records = csvReader!Layout(str);
int count;
foreach(record; records)
@ -295,7 +307,7 @@ unittest
{
string str = `76,26,22`;
int[] ans = [76,26,22];
auto records = csvText!int(str);
auto records = csvReader!int(str);
foreach(record; records)
{
@ -314,7 +326,7 @@ unittest
string name;
}
auto records = csvText!Layout(str, ["b","c","a"]);
auto records = csvReader!Layout(str, ["b","c","a"]);
Layout ans[2];
ans[0].name = "Hello";
@ -340,51 +352,46 @@ unittest
unittest
{
string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
auto records = csvText(str, ["b"]);
auto records = csvReader(str, ["b"]);
auto ans = ["65","123"];
foreach(record; records)
foreach(cell; record) {
assert(cell == ans.front);
ans.popFront();
}
auto ans = [["65"],["123"]];
foreach(record; records) {
assert(equal(record, ans.front));
ans.popFront();
}
try
{
records = csvText(str, ["b","a"]);
records = csvReader(str, ["b","a"]);
assert(0);
}
catch(Exception e)
{
}
auto records2 = csvText!(string, Malformed.ignore)(str, ["b","a"]);
auto records2 = csvReader!(string, Malformed.ignore)(str, ["b","a"]);
ans = ["Hello","65","World","123"];
foreach(record; records2)
foreach(cell; record)
{
assert(cell == ans.front);
ans.popFront();
}
ans = [["Hello","65"],["World","123"]];
foreach(record; records2) {
assert(equal(record, ans.front));
ans.popFront();
}
str = "a,c,e\nJoe,Carpenter,300000\nFred,Fly,4";
records2 = csvText!(string, Malformed.ignore)(str, ["a","b","c","d"]);
records2 = csvReader!(string, Malformed.ignore)(str, ["a","b","c","d"]);
ans = ["Joe","Carpenter","Fred","Fly"];
foreach(record; records2)
foreach(cell; record)
{
assert(cell == ans.front);
ans.popFront();
}
ans = [["Joe","Carpenter"],["Fred","Fly"]];
foreach(record; records2) {
assert(equal(record, ans.front));
ans.popFront();
}
}
// Test null header interface
unittest
{
string str = "a,b,c\nHello,65,63.63\nWorld,123,3673.562";
auto records = csvText(str, cast(string[])null);
auto records = csvReader(str, ["a"]);
assert(records.heading == ["a","b","c"]);
}
@ -393,7 +400,7 @@ unittest
unittest
{
string str = "one \"quoted\"";
foreach(record; csvText!(string, Malformed.ignore)(str))
foreach(record; csvReader!(string, Malformed.ignore)(str))
{
foreach(cell; record)
{
@ -406,7 +413,7 @@ unittest
{
string a,b;
}
foreach(record; csvText!(Ans, Malformed.ignore)(str))
foreach(record; csvReader!(Ans, Malformed.ignore)(str))
{
assert(record.a == "one \"quoted\"");
assert(record.b == "two \"quoted\" end");
@ -418,7 +425,7 @@ unittest
{
string str = "one,two\r\nthree";
auto records = csvText(str);
auto records = csvReader(str);
auto record = records.front;
assert(record.front == "one");
record.popFront();
@ -431,7 +438,7 @@ unittest
/**
* Range for iterating CSV records.
*
* This range is returned by the csvText functions. It can be
* This range is returned by the csvReader functions. It can be
* created in a similar manner to allow for custom separation.
*
* Example for integer data:
@ -439,7 +446,7 @@ unittest
* -------
* string str = `76;^26^;22`;
* int[] ans = [76,26,22];
* auto records = RecordList!(int,Malformed.ignore,string,char)
* auto records = Records!(int,Malformed.ignore,string,char,char[])
* (str, ';', '^');
*
* foreach(record; records) {
@ -448,9 +455,10 @@ unittest
* -------
*
*/
struct RecordList(Contents, Malformed ErrorLevel, Range, Separator)
struct Records(Contents, Malformed ErrorLevel, Range, Separator, Heading)
if(isSomeChar!Separator && isInputRange!Range
&& isSomeChar!(ElementType!Range) && !is(Contents == class))
&& isSomeChar!(ElementType!Range) && !is(Contents == class)
&& isInputRange!Heading)
{
private:
Range _input;
@ -466,12 +474,32 @@ private:
else
Record!(Contents, ErrorLevel, Range, Separator) recordRange;
public:
/// Heading from the input in array form.
/**
* Heading from the input in array form.
*
* -------
* string str = "a,b,c\nHello,65,63.63";
* auto records = csvReader(str, ["a"]);
*
* assert(records.heading == ["a","b","c"]);
* -------
*/
Range[] heading;
/**
* Constructor to initialize the input, delimiter and quote for input
* without a heading.
*
* -------
* string str = `76;^26^;22`;
* int[] ans = [76,26,22];
* auto records = Records!(int,Malformed.ignore,string,char,string[])
* (str, ';', '^');
*
* foreach(record; records) {
* assert(equal(record, ans));
* }
* -------
*/
this(Range input, Separator delimiter, Separator quote)
{
@ -492,19 +520,30 @@ public:
* Constructor to initialize the input, delimiter and quote for input
* with a heading.
*
* -------
* string str = `high;mean;low\n76;^26^;22`;
* int[] ans = [76,22];
* auto records = Records!(int,Malformed.ignore,string,char,string[])
* (str, ';', '^',["high","low"]);
*
* foreach(record; records) {
* assert(equal(record, ans));
* }
* -------
*
* Throws:
* HeadingMismatchException when a heading is provided but a
* matching column is not found or the order did not match that found
* in the input (non-struct).
*/
this(Range input, Separator delimiter, Separator quote, string[] colHeaders)
this(Range input, Separator delimiter, Separator quote, Heading colHeaders)
{
_input = input;
_separator = delimiter;
_quote = quote;
size_t[string] colToIndex;
foreach(i, h; colHeaders)
foreach(h; colHeaders)
{
colToIndex[h] = size_t.max;
}
@ -522,14 +561,15 @@ public:
colIndex++;
}
indices.length = colHeaders.length;
foreach(i, h; colHeaders)
indices.length = colToIndex.length;
int i;
foreach(h; colHeaders)
{
immutable index = colToIndex[h];
static if(ErrorLevel != Malformed.ignore)
enforceEx!(HeadingMismatchException)(index < size_t.max,
"Header not found: " ~ to!string(h));
indices[i] = index;
indices[i++] = index;
}
static if(!is(Contents == struct))
@ -554,6 +594,13 @@ public:
}
/**
* Part of the $(XREF range, InputRange) interface.
*
* Returns:
* If $(D Contents) is a struct, the struct will be filled with record
* data.
*
* If $(D Contents) is non-struct, a $(LREF Record) will be returned.
*/
@property auto front()
{
@ -570,6 +617,7 @@ public:
}
/**
* Part of the $(XREF range, InputRange) interface.
*/
@property bool empty()
{
@ -577,7 +625,7 @@ public:
}
/**
* Brings the next Record into the front of the range.
* Part of the $(XREF range, InputRange) interface.
*
* Throws:
* IncompleteCellException When a quote is found in an unquoted field,
@ -661,7 +709,7 @@ public:
unittest {
string str = `76;^26^;22`;
int[] ans = [76,26,22];
auto records = RecordList!(int,Malformed.ignore,string,char)
auto records = Records!(int,Malformed.ignore,string,char,string[])
(str, ';', '^');
foreach(record; records)
@ -671,7 +719,7 @@ unittest {
}
/**
* Returned by a RecordList when Contents is a non-struct.
* Returned by a Records when Contents is a non-struct.
*/
private struct Record(Contents, Malformed ErrorLevel, Range, Separator)
if(!is(Contents == class) && !is(Contents == struct))
@ -685,7 +733,7 @@ private:
bool _empty;
size_t[] _popCount;
public:
/**
/*
* params:
* input = Pointer to a character input range
* delimiter = Separator for each column
@ -722,6 +770,7 @@ public:
}
/**
* Part of the $(XREF range, InputRange) interface.
*/
@property Contents front()
{
@ -730,6 +779,7 @@ public:
}
/**
* Part of the $(XREF range, InputRange) interface.
*/
@property bool empty()
{
@ -753,7 +803,7 @@ public:
/**
* Brings the next Content into the front of the range.
* Part of the $(XREF range, InputRange) interface.
*
* Throws:
* IncompleteCellException When a quote is found in an unquoted field,