mirror of https://github.com/adamdruppe/arsd.git
Document.fromUrl improvement
This commit is contained in:
parent
85275db33d
commit
7d1ae8f27c
6
dom.d
6
dom.d
|
@ -82,7 +82,7 @@ bool isConvenientAttribute(string name) {
|
||||||
class Document : FileResource {
|
class Document : FileResource {
|
||||||
/// Convenience method for web scraping. Requires [arsd.http2] to be
|
/// Convenience method for web scraping. Requires [arsd.http2] to be
|
||||||
/// included in the build as well as [arsd.characterencodings].
|
/// included in the build as well as [arsd.characterencodings].
|
||||||
static Document fromUrl()(string url) {
|
static Document fromUrl()(string url, bool strictMode = false) {
|
||||||
import arsd.http2;
|
import arsd.http2;
|
||||||
auto client = new HttpClient();
|
auto client = new HttpClient();
|
||||||
|
|
||||||
|
@ -90,7 +90,11 @@ class Document : FileResource {
|
||||||
auto res = req.waitForCompletion();
|
auto res = req.waitForCompletion();
|
||||||
|
|
||||||
auto document = new Document();
|
auto document = new Document();
|
||||||
|
if(strictMode) {
|
||||||
|
document.parse(cast(string) res.content, true, true, res.contentTypeCharset);
|
||||||
|
} else {
|
||||||
document.parseGarbage(cast(string) res.content);
|
document.parseGarbage(cast(string) res.content);
|
||||||
|
}
|
||||||
|
|
||||||
return document;
|
return document;
|
||||||
}
|
}
|
||||||
|
|
11
http2.d
11
http2.d
|
@ -160,6 +160,17 @@ struct HttpResponse {
|
||||||
string contentType; /// The content type header
|
string contentType; /// The content type header
|
||||||
string location; /// The location header
|
string location; /// The location header
|
||||||
|
|
||||||
|
/// the charset out of content type, if present. `null` if not.
|
||||||
|
string contentTypeCharset() {
|
||||||
|
auto idx = contentType.indexOf("charset=");
|
||||||
|
if(idx == -1)
|
||||||
|
return null;
|
||||||
|
auto c = contentType[idx + "charset=".length .. $].strip;
|
||||||
|
if(c.length)
|
||||||
|
return c;
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
string[string] cookies; /// Names and values of cookies set in the response.
|
string[string] cookies; /// Names and values of cookies set in the response.
|
||||||
|
|
||||||
string[] headers; /// Array of all headers returned.
|
string[] headers; /// Array of all headers returned.
|
||||||
|
|
62
rss.d
62
rss.d
|
@ -440,10 +440,15 @@ auto test1 = `<?xml version="1.0" encoding="ISO-8859-1"?>
|
||||||
|
|
||||||
{
|
{
|
||||||
auto e = parseRss(test1);
|
auto e = parseRss(test1);
|
||||||
assert(e.items.length == 6);
|
assert(e.items.length == 6);
|
||||||
assert(e.items[$-1].title == "Yournamehere.com more important than anything", e.items[$-1].title);
|
assert(e.items[$-1].title == "Yournamehere.com more important than anything", e.items[$-1].title);
|
||||||
assert(e.items[0].title == "Giving the world a pluggable Gnutella");
|
assert(e.items[0].title == "Giving the world a pluggable Gnutella");
|
||||||
|
assert(e.items[0].link == "http://writetheweb.com/read.php?item=24");
|
||||||
assert(e.image.url == "http://writetheweb.com/images/mynetscape88.gif");
|
assert(e.image.url == "http://writetheweb.com/images/mynetscape88.gif");
|
||||||
|
|
||||||
|
auto df = e.toGenericFeed();
|
||||||
|
assert(df.items.length == 6);
|
||||||
|
assert(df.items[0].link == "http://writetheweb.com/read.php?item=24");
|
||||||
}
|
}
|
||||||
|
|
||||||
auto test2 = `<?xml version="1.0"?>
|
auto test2 = `<?xml version="1.0"?>
|
||||||
|
@ -644,4 +649,59 @@ auto testAtom1 = `<?xml version="1.0" encoding="utf-8"?>
|
||||||
assert(e.entries[0].content.html.length > 10);
|
assert(e.entries[0].content.html.length > 10);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
auto xml = `<rss version="2.0">
|
||||||
|
<channel>
|
||||||
|
<title>NYT > World News</title>
|
||||||
|
<link>
|
||||||
|
https://www.nytimes.com/section/world?emc=rss&partner=rss
|
||||||
|
</link>
|
||||||
|
<atom:link href="https://rss.nytimes.com/services/xml/rss/nyt/World.xml" rel="self" type="application/rss+xml"/>
|
||||||
|
<description/>
|
||||||
|
<language>en-us</language>
|
||||||
|
<copyright>Copyright 2019 The New York Times Company</copyright>
|
||||||
|
<lastBuildDate>Sat, 07 Dec 2019 00:15:41 +0000</lastBuildDate>
|
||||||
|
<image>
|
||||||
|
<title>NYT > World News</title>
|
||||||
|
<url>
|
||||||
|
https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png
|
||||||
|
</url>
|
||||||
|
<link>
|
||||||
|
https://www.nytimes.com/section/world?emc=rss&partner=rss
|
||||||
|
</link>
|
||||||
|
</image>
|
||||||
|
<item>
|
||||||
|
<title>
|
||||||
|
France Is Hit by Second Day of Pension Strikes as Unions Dig In
|
||||||
|
</title>
|
||||||
|
<link>https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss</link>
|
||||||
|
<guid isPermaLink="true">
|
||||||
|
https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html
|
||||||
|
</guid>
|
||||||
|
<atom:link href="https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss" rel="standout"/>
|
||||||
|
<description>
|
||||||
|
Transportation was severely disrupted in Paris and other cities, a day after huge protests over government plans to overhaul pensions. Unions are planning more protests next week.
|
||||||
|
</description>
|
||||||
|
<dc:creator>Aurelien Breeden</dc:creator>
|
||||||
|
<pubDate>Fri, 06 Dec 2019 18:02:13 +0000</pubDate>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_geo">France</category>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/des">Demonstrations, Protests and Riots</category>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/des">Pensions and Retirement Plans</category>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/des">Politics and Government</category>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/des">Strikes</category>
|
||||||
|
<category domain="http://www.nytimes.com/namespaces/keywords/nyt_per">Macron, Emmanuel (1977- )</category>
|
||||||
|
<media:content height="151" medium="image" url="https://static01.nyt.com/images/2019/12/06/world/06france-strikes/merlin_165509820_476d5340-3717-4fbb-b187-097ae7718e48-moth.jpg" width="151"/>
|
||||||
|
<media:credit>Rafael Yaghobzadeh/Associated Press</media:credit>
|
||||||
|
<media:description>
|
||||||
|
A deserted Gare de Lyon train station in Paris on Friday. Unions are aiming for a protracted strike.
|
||||||
|
</media:description>
|
||||||
|
</item></channel></rss>`;
|
||||||
|
|
||||||
|
auto e = parseRss(xml);
|
||||||
|
assert(e.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", e.items[0].link);
|
||||||
|
|
||||||
|
auto gf = e.toGenericFeed();
|
||||||
|
assert(gf.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", e.items[0].link);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue