diff --git a/dom.d b/dom.d
index 71b1b2d..8fd451c 100644
--- a/dom.d
+++ b/dom.d
@@ -82,7 +82,9 @@ bool isConvenientAttribute(string name) {
 class Document : FileResource {
 	/// Convenience method for web scraping. Requires [arsd.http2] to be
 	/// included in the build as well as [arsd.characterencodings].
-	static Document fromUrl()(string url) {
+	/// When `strictMode` is set, the content is handed to [parse] together with
+	/// the response's `contentTypeCharset`; otherwise [parseGarbage] is used.
+	static Document fromUrl()(string url, bool strictMode = false) {
 		import arsd.http2;
 		auto client = new HttpClient();
 
@@ -90,7 +92,11 @@ class Document : FileResource {
 		auto res = req.waitForCompletion();
 
 		auto document = new Document();
-		document.parseGarbage(cast(string) res.content);
+		if(strictMode) {
+			document.parse(cast(string) res.content, true, true, res.contentTypeCharset);
+		} else {
+			document.parseGarbage(cast(string) res.content);
+		}
 
 		return document;
 	}
diff --git a/http2.d b/http2.d
index 634780f..e179516 100644
--- a/http2.d
+++ b/http2.d
@@ -160,6 +160,31 @@ struct HttpResponse {
 	string contentType; /// The content type header
 	string location; /// The location header
 
+	/// The charset parameter of the content type, or `null` if it is absent
+	/// or empty. The parameter name is matched case-insensitively, anything
+	/// after the next `;` is ignored and one pair of surrounding double quotes
+	/// is removed, so `text/html; Charset="utf-8"; q=1` yields `utf-8`.
+	string contentTypeCharset() {
+		import std.string : CaseSensitive, indexOf, strip;
+
+		enum key = "charset=";
+		auto idx = contentType.indexOf(key, CaseSensitive.no);
+		if(idx == -1)
+			return null;
+		auto c = contentType[idx + key.length .. $];
+		// cut at the next parameter, e.g. `charset=utf-8; boundary=x`
+		auto end = c.indexOf(';');
+		if(end != -1)
+			c = c[0 .. end];
+		c = c.strip;
+		// the value may be written as a quoted string
+		if(c.length >= 2 && c[0] == '"' && c[$ - 1] == '"')
+			c = c[1 .. $ - 1];
+		if(c.length)
+			return c;
+		return null;
+	}
+
 	string[string] cookies; /// Names and values of cookies set in the response.
 
 	string[] headers; /// Array of all headers returned.
diff --git a/rss.d b/rss.d
index f05c24f..f52484b 100644
--- a/rss.d
+++ b/rss.d
@@ -440,10 +440,16 @@ auto test1 = `
 
 	{
 		auto e = parseRss(test1);
-		assert(e.items.length = 6);
+		assert(e.items.length == 6);
 		assert(e.items[$-1].title == "Yournamehere.com more important than anything", e.items[$-1].title);
 		assert(e.items[0].title == "Giving the world a pluggable Gnutella");
+		assert(e.items[0].link == "http://writetheweb.com/read.php?item=24");
 		assert(e.image.url == "http://writetheweb.com/images/mynetscape88.gif");
+
+		// the item count and link must also survive conversion to the generic feed
+		auto df = e.toGenericFeed();
+		assert(df.items.length == 6);
+		assert(df.items[0].link == "http://writetheweb.com/read.php?item=24");
 	}
 
 auto test2 = `
@@ -644,4 +650,61 @@ auto testAtom1 = `
 		assert(e.entries[0].content.html.length > 10);
 	}
 
+	{
+		// the item below lists its URL both with and without a query string; `link` must be the former
+		auto xml = `
+
+NYT > World News
+
+https://www.nytimes.com/section/world?emc=rss&partner=rss
+
+
+
+en-us
+Copyright 2019 The New York Times Company
+Sat, 07 Dec 2019 00:15:41 +0000
+
+NYT > World News
+
+https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png
+
+
+https://www.nytimes.com/section/world?emc=rss&partner=rss
+
+
+
+
+France Is Hit by Second Day of Pension Strikes as Unions Dig In
+
+https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss
+
+https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html
+
+
+
+Transportation was severely disrupted in Paris and other cities, a day after huge protests over government plans to overhaul pensions. Unions are planning more protests next week.
+
+Aurelien Breeden
+Fri, 06 Dec 2019 18:02:13 +0000
+France
+Demonstrations, Protests and Riots
+Pensions and Retirement Plans
+Politics and Government
+Strikes
+Macron, Emmanuel (1977- )
+
+Rafael Yaghobzadeh/Associated Press
+
+A deserted Gare de Lyon train station in Paris on Friday. Unions are aiming for a protracted strike.
+
+`;
+
+		auto e = parseRss(xml);
+		assert(e.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", e.items[0].link);
+
+		// on failure, report the generic feed's link, since that is the value under test here
+		auto gf = e.toGenericFeed();
+		assert(gf.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", gf.items[0].link);
+	}
+
 }