Reuters translator

1 view
Skip to first unread message

Patrick Flaherty

unread,
Nov 19, 2009, 9:53:05 PM11/19/09
to zoter...@googlegroups.com
Hi all,
I posted this on the forum but got no reply.

The translator has trouble grabbing articles from sites other than the main one, e.g. http://in.reuters.com. That can be fixed by changing the target to "http://(.*\.)?reuters.com/".

The second issue has to do with bylines. The translator breaks when it encounters a non-standard byline, and Reuters tends to have a fair number of them. The else statement in the translator doesn't seem to capture those non-standard bylines, and I'm not sure it's even possible to capture all the varieties. I would recommend removing the else statement in the byline match.

Thanks
Patrick
I have pasted the 2.0 translator code that I've been using. I tend to grab about a dozen Reuters articles daily and so far have not experienced any problems with the code below.

{
"translatorID":"83979786-44af-494a-9ddb-46654e0486ef",
"translatorType":4,
"label":"Reuters",
"creator":"Michael Berkowitz",
"target":"http://(.*\\.)?reuters.com/",
"minVersion":"1.0.0b4.r5",
"maxVersion":"",
"priority":100,
"inRepository":true,
"lastUpdated":"2008-07-07 14:50:00"
}

/**
 * Decide whether the current page is something this translator can save.
 *
 * @param {Document} doc - The loaded page (not inspected here).
 * @param {string} url - Address of the loaded page.
 * @returns {string|undefined} "newspaperArticle" when the URL contains
 *     "article"; otherwise undefined.
 */
function detectWeb(doc, url) {
  var looksLikeArticle = url.match(/article/) !== null;
  return looksLikeArticle ? "newspaperArticle" : undefined;
}

/**
 * Scrape a Reuters article page into a "newspaperArticle" item and save it.
 *
 * Each field is filled in only when the corresponding page element is found
 * and has the expected shape; a missing headline, timestamp, byline or lead
 * paragraph leaves that field unset instead of aborting the whole save.
 *
 * @param {Document} doc - The loaded article page.
 * @param {string} url - Address of the page; stored as the item's URL.
 * @returns {undefined} The item is handed off through item.complete().
 */
function doWeb(doc, url) {
  // Text of the first node matching `xpath`, or null when nothing matches.
  function firstText(xpath) {
    var node = doc.evaluate(xpath, doc, null, XPathResult.ANY_TYPE, null).iterateNext();
    return node ? node.textContent : null;
  }

  var item = new Zotero.Item("newspaperArticle");

  var headline = firstText('//div[@class="article primaryContent"]/h1');
  if (headline !== null) {
    item.title = Zotero.Utilities.trimInternal(headline);
  }

  // Keep everything up to and including the last four-digit run (the year).
  var timestamp = firstText('//div[@class="timestampHeader"]');
  var dateMatch = timestamp !== null ? timestamp.match(/^.*\d{4}/) : null;
  if (dateMatch) {
    item.date = dateMatch[0];
  }

  // Only a first paragraph starting with "By" is treated as a byline; any
  // other first paragraph yields no creators and no abstract.
  var byline = firstText('//div[@id="resizeableText"]/p[1]');
  if (byline !== null && byline.match(/^By/)) {
    var authors = byline.substr(3).split(',');
    // Plain indexed loop: the Mozilla-only `for each ... in` form has been
    // removed from JavaScript engines.
    for (var i = 0; i < authors.length; i++) {
      // Skip blank fragments such as the one left by a trailing comma.
      if (/\S/.test(authors[i])) {
        item.creators.push(Zotero.Utilities.cleanAuthor(authors[i], "author"));
      }
    }

    // The abstract is whatever follows the first "- " in the second
    // paragraph (the text after the dateline dash).
    var lead = firstText('//div[@id="resizeableText"]/p[2]');
    var abstractMatch = lead !== null ? lead.match(/\-\s+(.*)$/) : null;
    if (abstractMatch) {
      item.abstractNote = abstractMatch[1];
    }
  }

  item.url = url;
  item.complete();
}
Reply all
Reply to author
Forward
0 new messages