/++
	RSS/Atom feed reading

	References:
	$(LIST
		* https://cyber.harvard.edu/rss/rss.html
		* http://www.rssboard.org/rss-specification
		* https://tools.ietf.org/html/rfc4287
		* https://en.wikipedia.org/wiki/Atom_(Web_standard)
	)
+/
module arsd.rss;

import arsd.dom;

/// generic subset of rss and atom, normalized for easy consumption
struct Feed {
	string title; /// RSS channel title or Atom feed title
	string description; /// RSS channel description or Atom subtitle
	string lastUpdated; /// RSS lastBuildDate or Atom updated, copied verbatim (the date format is NOT normalized)

	/// One article of the feed: an RSS item or an Atom entry.
	static struct Item {
		string title; /// item/entry title
		string link; /// RSS link, or the Atom entry's alternate link
		string description; /// RSS description; for Atom the summary (html form if present, otherwise text)
		string author; /// RSS author; for Atom the author's email, or the name when no email is given
		string publicationDate; /// RSS pubDate or Atom published, copied verbatim
		string lastUpdatedDate; /// Atom updated; left empty for RSS
		string guid; /// RSS guid or Atom id

		string enclosureUri; /// url/href of the attached enclosure, if any
		string enclosureType; /// type attribute of the enclosure
		string enclosureSize; /// length attribute of the enclosure, kept as a string
	}

	Item[] items; /// items in the order they were found in the document
}

/// The feed flavors [identifyFeed] can tell apart.
enum FeedType {
	unknown, /// root element is neither of the recognized ones
	rss, /// root element is `rss`
	atom /// root element is `feed` or `atom:feed`
}

/++
	Identifies the feed flavor by looking at the tag name of the root element.

	Params:
		e = root element of the document; must not be null
+/
FeedType identifyFeed(Element e) {
	assert(e !is null);

	if(e.tagName == "rss")
		return FeedType.rss;
	if(e.tagName == "feed" || e.tagName == "atom:feed")
		return FeedType.atom;

	return FeedType.unknown;
}

/++
	Parses a feed generically

	Throws: Exception when the root element is not a recognized feed type.
+/
Feed parseFeed(Element e) {
	final switch(identifyFeed(e)) {
		case FeedType.unknown:
			throw new Exception("Unknown feed type");
		case FeedType.rss:
			return parseRss(e).toGenericFeed();
		case FeedType.atom:
			return parseAtom(e).toGenericFeed();
	}
}

// application/rss+xml
// though some use text/rss+xml or application/rdf+xml

// the channel element, looked up as a direct child of the rss root node
struct RssChannel {
	string title;
	string link;
	string description;
	string lastBuildDate; // last time content in here changed
	string pubDate; // format like "Sat, 07 Sep 2002 00:00:01 GMT" when it officially changes
	string docs; // idk?

	string cloud; // has domain, port, path, registerProcedure, protocol

	string language; // optional
	string copyright;
	string managingEditor;
	string webMaster;

	string category;

	string ttl; // in minutes, if present

	RssImage image;

	RssItem[] items;

	/// Converts to the format-independent [Feed]; strings are copied as-is.
	Feed toGenericFeed() {
		Feed f;
		f.title = this.title;
		f.description = this.description; // FIXME text vs html?
		f.lastUpdated = this.lastBuildDate; // FIXME: normalize format rss uses "Mon, 18 Nov 2019 12:00:00 GMT"

		foreach(item; items) {
			Feed.Item fi;

			fi.title = item.title;
			fi.link = item.link;
			fi.description = item.description; // FIXME: try to normalize text vs html
			fi.author = item.author; // FIXME
			fi.publicationDate = item.pubDate; // FIXME
			fi.guid = item.guid;
			//fi.lastUpdatedDate; // not available i think

			fi.enclosureUri = item.enclosure.url;
			fi.enclosureType = item.enclosure.type;
			fi.enclosureSize = item.enclosure.length;

			f.items ~= fi;
		}
		return f;
	}
}

struct RssImage {
	string title; /// img alt
	string url; /// like the img src
	string link; /// like a href
	string width;
	string height;
	string description; /// img title
}

struct RssItem {
	string title;
	string link;
	string description; // may have html!

	string author;
	string category;
	string comments; // a link

	string pubDate;
	string guid;

	RssSource source;
	RssEnclosure enclosure;
}

struct RssEnclosure {
	string url;
	string length;
	string type;
}

struct RssSource {
	string title;
	string url;
}


/++
	Parses RSS into structs. Requires the element to be RSS; if you are unsure
	of the type and want a generic response, use parseFeed instead.

	Every string field of [RssChannel], [RssImage] and [RssItem] is filled from
	the inner text of the direct child element with the same name as the field;
	`source` and `enclosure` additionally read attributes.
+/
RssChannel parseRss(Element element) {
	assert(element !is null && element.tagName == "rss");
	RssChannel c;
	element = element.requireSelector(" > channel");
	foreach(memberName; __traits(allMembers, RssChannel)) {
		static if(memberName == "image") {
			if(auto image = element.querySelector(" > image")) {
				RssImage i;
				foreach(mn; __traits(allMembers, RssImage)) {
					__traits(getMember, i, mn) = image.optionSelector(" > " ~ mn).innerText;
				}
				c.image = i;
			}
		} else static if(memberName == "items") {
			foreach(item; element.querySelectorAll(" > item")) {
				RssItem i;
				foreach(mn; __traits(allMembers, RssItem)) {
					static if(mn == "source") {
						if(auto s = item.querySelector(" > source")) {
							i.source.title = s.innerText;
							i.source.url = s.attrs.url;
						}
					} else static if(mn == "enclosure") {
						if(auto s = item.querySelector(" > enclosure")) {
							i.enclosure.url = s.attrs.url;
							i.enclosure.type = s.attrs.type;
							i.enclosure.length = s.attrs.length;
						}
					} else {
						__traits(getMember, i, mn) = item.optionSelector(" > " ~ mn).innerText;
					}
				}
				c.items ~= i;
			}
		} else static if(is(typeof( __traits(getMember, c, memberName).offsetof))) {
			// .offsetof only exists on fields, so member functions such as toGenericFeed are skipped
			__traits(getMember, c, memberName) = element.optionSelector(" > " ~ memberName).innerText;
		}
	}

	return c;
}

/// Parses the document source `s` and forwards its root element to the [Element] overload.
RssChannel parseRss(string s) {
	// NOTE(review): the two `true` arguments are presumably arsd.dom's
	// case-sensitive and strict flags - confirm against Document's constructor
	auto document = new Document(s, true, true);
	return parseRss(document.root);
}

/*
struct SyndicationInfo {
	string updatePeriod; // sy:updatePeriod
	string updateFrequency;
	string updateBase;

	string skipHours; // stored as child elements
	string skipDays; // stored as child elements
}
*/


// /////////////////// atom ////////////////////

// application/atom+xml

/+ rss vs atom
	date format is different
	atom:xxx links

	root node is feed (rss uses rss), there is no channel wrapper,
	and articles are entry elements instead of item
+/

/++
	The root `feed` element of an Atom document, as filled in by [parseAtom].
+/
struct AtomFeed {
	string title; /// has a type attribute - text or html
	string subtitle; /// has a type attribute

	string updated; /// timestamp string, copied verbatim (e.g. 2005-07-31T12:29:29Z)

	string id; ///
	string link; /// i want the text/html type really, certainly not rel=self
	string rights; ///
	string generator; ///

	AtomEntry[] entries; ///

	/// Converts to the format-independent [Feed]; strings are copied as-is.
	Feed toGenericFeed() {
		Feed feed;

		feed.title = this.title;
		feed.description = this.subtitle;
		feed.lastUpdated = this.updated; // FIXME: normalize the format is 2005-07-31T12:29:29Z

		foreach(entry; this.entries) {
			Feed.Item item;

			item.title = entry.title;
			item.link = entry.link;
			item.description = entry.summary.html.length ? entry.summary.html : entry.summary.text; // FIXME
			// RFC 4287 requires only the name inside an author; the email is
			// optional, so fall back to the name instead of leaving it blank.
			item.author = entry.author.email.length ? entry.author.email : entry.author.name; // FIXME normalize; RSS does "email (name)"
			item.publicationDate = entry.published; // FIXME the format is 2005-07-31T12:29:29Z
			item.lastUpdatedDate = entry.updated;
			item.guid = entry.id;

			item.enclosureUri = entry.enclosure.url;
			item.enclosureType = entry.enclosure.type;
			item.enclosureSize = entry.enclosure.length;

			feed.items ~= item;
		}

		return feed;
	}
}

/// One `entry` of an Atom feed.
struct AtomEntry {
	string title; ///
	string link; /// the alternate
	AtomEnclosure enclosure; ///
	string id; ///
	string updated; ///
	string published; ///

	AtomPerson author; ///
	AtomPerson[] contributors; ///
	AtomContent content; /// // should check type. may also have a src element for a link. type of html is escaped, type of xhtml is embedded.
	AtomContent summary; ///
}

/// Attributes of a `link rel="enclosure"` element.
struct AtomEnclosure {
	string url; /// taken from the href attribute
	string length; ///
	string type; ///
}


/// Text construct; [parseAtom] fills `html` for type html/xhtml and `text` for type text or no type.
struct AtomContent {
	string text; ///
	string html; ///
}

/// Author or contributor.
struct AtomPerson {
	string name; ///
	string uri; ///
	string email; ///
}

/++
	Parses an Atom document into structs.

	Params:
		ele = the root feed element. It is not checked here; use [parseFeed]
		if you are unsure of the type.
+/
AtomFeed parseAtom(Element ele) {
	AtomFeed af;
	af.title = ele.optionSelector(` > title, > atom\:title`).innerText;
	af.subtitle = ele.optionSelector(` > subtitle, > atom\:subtitle`).innerText;
	af.id = ele.optionSelector(` > id, > atom\:id`).innerText;
	af.updated = ele.optionSelector(` > updated, > atom\:updated`).innerText;
	af.rights = ele.optionSelector(` > rights, > atom\:rights`).innerText;
	af.generator = ele.optionSelector(` > generator, > atom\:generator`).innerText;
	// a link without rel means rel=alternate (RFC 4287), so accept the explicit
	// spelling too, same as the per-entry lookup below
	af.link = ele.optionSelector(` > link:not([rel]), > link[rel=alternate]`).getAttribute("href");

	// identifyFeed accepts an atom:feed root, so accept prefixed entries as well.
	// The link/author/contributor/content/summary lookups below still only
	// match unprefixed element names.
	foreach(entry; ele.querySelectorAll(` > entry, > atom\:entry`)) {
		AtomEntry ae;

		ae.title = entry.optionSelector(` > title, > atom\:title`).innerText;
		ae.updated = entry.optionSelector(` > updated, > atom\:updated`).innerText;
		ae.published = entry.optionSelector(` > published, > atom\:published`).innerText;
		ae.id = entry.optionSelector(` > id, > atom\:id`).innerText;

		// the MIME type is text/html
		ae.link = entry.optionSelector(` > link:not([rel]), > link[rel=alternate], > link[type="text/html"]`).getAttribute("href");

		if(auto enclosure = entry.querySelector(` > link[rel=enclosure]`)) {
			ae.enclosure.url = enclosure.attrs.href;
			ae.enclosure.length = enclosure.attrs.length;
			ae.enclosure.type = enclosure.attrs.type;
		}

		if(auto author = entry.querySelector(` > author`)) {
			ae.author.name = author.optionSelector(` > name`).innerText;
			ae.author.uri = author.optionSelector(` > uri`).innerText;
			ae.author.email = author.optionSelector(` > email`).innerText;
		}

		foreach(contributor; entry.querySelectorAll(` > contributor`)) {
			AtomPerson c;
			c.name = contributor.optionSelector(` > name`).innerText;
			c.uri = contributor.optionSelector(` > uri`).innerText;
			c.email = contributor.optionSelector(` > email`).innerText;
			ae.contributors ~= c;
		}

		// Only direct children are considered: embedded xhtml may itself
		// contain elements called summary/content, which must not be
		// mistaken for the entry's own.
		// xhtml is embedded markup, html is escaped text.
		if(auto e = entry.querySelector(` > content[type=xhtml]`))
			ae.content.html = e.innerHTML;
		if(auto e = entry.querySelector(` > content[type=html]`))
			ae.content.html = e.innerText;
		if(auto e = entry.querySelector(` > content[type=text], > content:not([type])`))
			ae.content.text = e.innerText;

		if(auto e = entry.querySelector(` > summary[type=xhtml]`))
			ae.summary.html = e.innerHTML;
		if(auto e = entry.querySelector(` > summary[type=html]`))
			ae.summary.html = e.innerText;
		if(auto e = entry.querySelector(` > summary[type=text], > summary:not([type])`))
			ae.summary.text = e.innerText;

		af.entries ~= ae;
	}

	return af;
}

/// Parses the document source `s` and forwards its root element to the [Element] overload.
AtomFeed parseAtom(string s) {
	// NOTE(review): the two `true` arguments are presumably arsd.dom's
	// case-sensitive and strict flags - confirm against Document's constructor
	auto document = new Document(s, true, true);
	return parseAtom(document.root);
}

unittest {
	// NOTE(review): the fixture strings in this test contain no XML element
	// markup as seen here - presumably it was lost before review. The
	// assertions below expect parsed elements, so confirm the fixtures
	// against the upstream file before relying on this test.

	auto test1 = ` WriteTheWeb http://writetheweb.com News for web users that write back en-us Copyright 2000, WriteTheWeb team. editor@writetheweb.com webmaster@writetheweb.com WriteTheWeb http://writetheweb.com/images/mynetscape88.gif http://writetheweb.com 88 31 News for web users that write back Giving the world a pluggable Gnutella http://writetheweb.com/read.php?item=24 WorldOS is a framework on which to build programs that work like Freenet or Gnutella -allowing distributed applications using peer-to-peer routing. Syndication discussions hot up http://writetheweb.com/read.php?item=23 After a period of dormancy, the Syndication mailing list has become active again, with contributions from leaders in traditional media and Web syndication. Personal web server integrates file sharing and messaging http://writetheweb.com/read.php?item=22 The Magi Project is an innovative project to create a combined personal web server and messaging system that enables the sharing and synchronization of information across desktop, laptop and palmtop devices. Syndication and Metadata http://writetheweb.com/read.php?item=21 RSS is probably the best known metadata format around. RDF is probably one of the least understood. 
In this essay, published on my O'Reilly Network weblog, I argue that the next generation of RSS should be based on RDF. UK bloggers get organised http://writetheweb.com/read.php?item=20 Looks like the weblogs scene is gathering pace beyond the shores of the US. There's now a UK-specific page on weblogs.com, and a mailing list at egroups. Yournamehere.com more important than anything http://writetheweb.com/read.php?item=19 Whatever you're publishing on the web, your site name is the most valuable asset you have, according to Carl Steadman. `;

	{
		auto e = parseRss(test1);
		assert(e.items.length == 6);
		assert(e.items[$-1].title == "Yournamehere.com more important than anything", e.items[$-1].title);
		assert(e.items[0].title == "Giving the world a pluggable Gnutella");
		assert(e.items[0].link == "http://writetheweb.com/read.php?item=24");
		assert(e.image.url == "http://writetheweb.com/images/mynetscape88.gif");

		auto df = e.toGenericFeed();
		assert(df.items.length == 6);
		assert(df.items[0].link == "http://writetheweb.com/read.php?item=24");
	}

	auto test2 = ` Dave Winer: Grateful Dead http://www.scripting.com/blog/categories/gratefulDead.html A high-fidelity Grateful Dead song every day. This is where we're experimenting with enclosures on RSS news items that download when you're not using your computer. If it works (it will) it will be the end of the Click-And-Wait multimedia experience on the Internet. Fri, 13 Apr 2001 19:23:02 GMT http://backend.userland.com/rss092 dave@userland.com (Dave Winer) dave@userland.com (Dave Winer) It's been a few days since I added a song to the Grateful Dead channel. Now that there are all these new Radio users, many of whom are tuned into this channel (it's #16 on the hotlist of upstreaming Radio users, there's no way of knowing how many non-upstreaming users are subscribing, have to do something about this..). Anyway, tonight's song is a live version of Weather Report Suite from Dick's Picks Volume 7. It's wistful music. 
Of course a beautiful song, oft-quoted here on Scripting News. <i>A little change, the wind and rain.</i> Kevin Drennan started a <a href="http://deadend.editthispage.com/">Grateful Dead Weblog</a>. Hey it's cool, he even has a <a href="http://deadend.editthispage.com/directory/61">directory</a>. <i>A Frontier 7 feature.</i> Scripting News <a href="http://arts.ucsc.edu/GDead/AGDL/other1.html">The Other One</a>, live instrumental, One From The Vault. Very rhythmic very spacy, you can listen to it many times, and enjoy something new every time. This is a test of a change I just made. Still diggin.. The HTML rendering almost <a href="http://validator.w3.org/check/referer">validates</a>. Close. Hey I wonder if anyone has ever published a style guide for ALT attributes on images? What are you supposed to say in the ALT attribute? I sure don't know. If you're blind send me an email if u cn rd ths. <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/Franklin's_Tower.txt">Franklin's Tower</a>, a live version from One From The Vault. Moshe Weitzman says Shakedown Street is what I'm lookin for for tonight. I'm listening right now. It's one of my favorites. "Don't tell me this town ain't got no heart." Too bright. I like the jazziness of Weather Report Suite. Dreamy and soft. How about The Other One? "Spanish lady come to me.." Scripting News <a href="http://www.scripting.com/mp3s/youWinAgain.mp3">The news is out</a>, all over town..<p> You've been seen, out runnin round. <p> The lyrics are <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/You_Win_Again.txt">here</a>, short and sweet. <p> <i>You win again!</i> <a href="http://www.getlyrics.com/lyrics/grateful-dead/wake-of-the-flood/07.htm">Weather Report Suite</a>: "Winter rain, now tell me why, summers fade, and roses die? The answer came. The wind and rain. Golden hills, now veiled in grey, summer leaves have blown away. Now what remains? The wind and rain." 
<a href="http://arts.ucsc.edu/gdead/agdl/darkstar.html">Dark Star</a> crashes, pouring its light into ashes. DaveNet: <a href="http://davenet.userland.com/2001/01/21/theUsBlues">The U.S. Blues</a>. Still listening to the US Blues. <i>"Wave that flag, wave it wide and high.."</i> Mistake made in the 60s. We gave our country to the assholes. Ah ah. Let's take it back. Hey I'm still a hippie. <i>"You could call this song The United States Blues."</i> <a href="http://www.sixties.com/html/garcia_stack_0.html"><img src="http://www.scripting.com/images/captainTripsSmall.gif" height="51" width="42" border="0" hspace="10" vspace="10" align="right"></a>In celebration of today's inauguration, after hearing all those great patriotic songs, America the Beautiful, even The Star Spangled Banner made my eyes mist up. It made my choice of Grateful Dead song of the night realllly easy. Here are the <a href="http://searchlyrics2.homestead.com/gd_usblues.html">lyrics</a>. Click on the audio icon to the left to give it a listen. "Red and white, blue suede shoes, I'm Uncle Sam, how do you do?" It's a different kind of patriotic music, but man I love my country and I love Jerry and the band. <i>I truly do!</i> Grateful Dead: "Tennessee, Tennessee, ain't no place I'd rather be." Ed Cone: "Had a nice Deadhead experience with my wife, who never was one but gets the vibe and knows and likes a lot of the music. Somehow she made it to the age of 40 without ever hearing Wharf Rat. We drove to Jersey and back over Christmas with the live album commonly known as Skull and Roses in the CD player much of the way, and it was cool to see her discover one the band's finest moments. That song is unique and underappreciated. Fun to hear that disc again after a few years off -- you get Jerry as blues-guitar hero on Big Railroad Blues and a nice version of Bertha." <a href="http://arts.ucsc.edu/GDead/AGDL/fotd.html">Tonight's Song</a>: "If I get home before daylight I just might get some sleep tonight." 
<a href="http://arts.ucsc.edu/GDead/AGDL/uncle.html">Tonight's song</a>: "Come hear Uncle John's Band by the river side. Got some things to talk about here beside the rising tide." <a href="http://www.cs.cmu.edu/~mleone/gdead/dead-lyrics/Me_and_My_Uncle.txt">Me and My Uncle</a>: "I loved my uncle, God rest his soul, taught me good, Lord, taught me all I know. Taught me so well, I grabbed that gold and I left his dead ass there by the side of the road." Truckin, like the doo-dah man, once told me gotta play your hand. Sometimes the cards ain't worth a dime, if you don't lay em down. Two-Way-Web: <a href="http://www.thetwowayweb.com/payloadsForRss">Payloads for RSS</a>. "When I started talking with Adam late last year, he wanted me to think about high quality video on the Internet, and I totally didn't want to hear about it." A touch of gray, kinda suits you anyway.. <a href="http://www.sixties.com/html/garcia_stack_0.html"><img src="http://www.scripting.com/images/captainTripsSmall.gif" height="51" width="42" border="0" hspace="10" vspace="10" align="right"></a>In celebration of today's inauguration, after hearing all those great patriotic songs, America the Beautiful, even The Star Spangled Banner made my eyes mist up. It made my choice of Grateful Dead song of the night realllly easy. Here are the <a href="http://searchlyrics2.homestead.com/gd_usblues.html">lyrics</a>. Click on the audio icon to the left to give it a listen. "Red and white, blue suede shoes, I'm Uncle Sam, how do you do?" It's a different kind of patriotic music, but man I love my country and I love Jerry and the band. <i>I truly do!</i> `;

	{
		auto e = parseRss(test2);
		assert(e.items[$-1].enclosure.url == "http://www.scripting.com/mp3s/usBlues.mp3");
	}

	auto test3 = ` Liftoff News http://liftoff.msfc.nasa.gov/ Liftoff to Space Exploration. 
en-us Tue, 10 Jun 2003 04:00:00 GMT Tue, 10 Jun 2003 09:41:01 GMT http://blogs.law.harvard.edu/tech/rss Weblog Editor 2.0 editor@example.com webmaster@example.com Star City http://liftoff.msfc.nasa.gov/news/2003/news-starcity.asp How do Americans get ready to work with Russians aboard the International Space Station? They take a crash course in culture, language and protocol at Russia's <a href="http://howe.iki.rssi.ru/GCTC/gctc_e.htm">Star City</a>. Tue, 03 Jun 2003 09:39:21 GMT http://liftoff.msfc.nasa.gov/2003/06/03.html#item573 Sky watchers in Europe, Asia, and parts of Alaska and Canada will experience a <a href="http://science.nasa.gov/headlines/y2003/30may_solareclipse.htm">partial eclipse of the Sun</a> on Saturday, May 31st. Fri, 30 May 2003 11:06:42 GMT http://liftoff.msfc.nasa.gov/2003/05/30.html#item572 The Engine That Does More http://liftoff.msfc.nasa.gov/news/2003/news-VASIMR.asp Before man travels to Mars, NASA hopes to design new engines that will let us fly through the Solar System more quickly. The proposed VASIMR engine would do that. Tue, 27 May 2003 08:37:32 GMT http://liftoff.msfc.nasa.gov/2003/05/27.html#item571 Astronauts' Dirty Laundry http://liftoff.msfc.nasa.gov/news/2003/news-laundry.asp Compared to earlier spacecraft, the International Space Station has many luxuries, but laundry facilities are not one of them. Instead, astronauts have other options. Tue, 20 May 2003 08:56:02 GMT http://liftoff.msfc.nasa.gov/2003/05/20.html#item570 `;

	auto testAtom1 = ` Example Feed A subtitle. urn:uuid:60a76c80-d399-11d9-b91C-0003939e0af6 2003-12-13T18:30:02Z Atom-Powered Robots Run Amok urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a 2003-12-13T18:30:02Z Some text.

This is the entry content.

John Doe johndoe@example.com
`;

	{
		auto e = parseAtom(testAtom1);

		assert(e.entries.length == 1);
		assert(e.link == "http://example.org/");
		assert(e.title == "Example Feed");
		assert(e.entries[0].title == "Atom-Powered Robots Run Amok");
		assert(e.entries[0].link == "http://example.org/2003/12/13/atom03", e.entries[0].link);
		assert(e.entries[0].summary.text == "Some text.", e.entries[0].summary.text);
		assert(e.entries[0].summary.html.length == 0);
		assert(e.entries[0].content.text.length == 0);
		assert(e.entries[0].content.html.length > 10);
	}

	{
		auto xml = ` NYT > World News https://www.nytimes.com/section/world?emc=rss&partner=rss en-us Copyright 2019 The New York Times Company Sat, 07 Dec 2019 00:15:41 +0000 NYT > World News https://static01.nyt.com/images/misc/NYT_logo_rss_250x40.png https://www.nytimes.com/section/world?emc=rss&partner=rss France Is Hit by Second Day of Pension Strikes as Unions Dig In https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html Transportation was severely disrupted in Paris and other cities, a day after huge protests over government plans to overhaul pensions. Unions are planning more protests next week. Aurelien Breeden Fri, 06 Dec 2019 18:02:13 +0000 France Demonstrations, Protests and Riots Pensions and Retirement Plans Politics and Government Strikes Macron, Emmanuel (1977- ) Rafael Yaghobzadeh/Associated Press A deserted Gare de Lyon train station in Paris on Friday. Unions are aiming for a protracted strike. `;

		auto e = parseRss(xml);
		assert(e.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", e.items[0].link);

		auto gf = e.toGenericFeed();
		// report the value actually under test when this fails
		assert(gf.items[0].link == "https://www.nytimes.com/2019/12/06/world/europe/france-pension-strike-macron.html?emc=rss&partner=rss", gf.items[0].link);
	}
}