使用 Zotero 翻译器(translator)获取作者所属机构



我目前正在做一个项目,需要获取 dblp 上所收录文章的作者所属机构(affiliation)。为此我搭建了 Zotero 的 translation-server(可以从他们的 GitHub 上获取),并按照其中的说明完成了配置。

然后我在java程序中建立一个连接,如下所示:

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import org.json.*;

public class ZoteroHandler 
{
//Function runing the scan
public static void Scan(Article article) throws Exception
{
    //Setting up an URL HttpURLConnection given DOI
    URL urlDoi = new URL (article.GetElectronicEdition());
    HttpURLConnection connDoi = (HttpURLConnection) urlDoi.openConnection();
    // Make the logic below easier to detect redirections
    connDoi.setInstanceFollowRedirects(false);  
    String doi = "{"url"Smiley unsure"" + connDoi.getHeaderField("Location") + "","sessionid"Smiley unsure"abc123"}";
    //Setting up an URL to translation-server
    URL url = new URL("http://127.0.0.1:1969/web");
    URLConnection conn = url.openConnection();
    conn.setDoOutput(true);
    conn.setRequestProperty("Content-Type", "application/json");
    OutputStreamWriter writer = new OutputStreamWriter(conn.getOutputStream());
    writer.write(doi);
    writer.flush();
    String line;
    BufferedReader reader = new BufferedReader(new InputStreamReader(conn.getInputStream()));
    while ((line = reader.readLine()) != null ) 
    {
        //Used to see of we get something from stream
        System.out.println(line);
        //Incoming is JSONArray, so create new array, fill it then parse it 
        JSONArray jsonArr = new JSONArray(line);
        JSONObject obj = jsonArr.getJSONObject(0);
        //Getting abstracts
        String abstracts = obj.getString("abstractNote");
        System.out.println(abstracts);
        //Setting information in db
        article.SetAbstracts(abstracts);
        DatabaseHandler.GetInstance().UpdateArticle(article);
    }
    writer.close(); 
    reader.close(); 
    //Need to disconnect?
    //((HttpURLConnection) conn).disconnect();
    //connDoi.disconnect();
}

到目前为止一切顺利:我能拿到想要的信息,把它存入字符串 abstracts(摘要),再写入数据库。但现在我还需要获取作者所属机构,因此需要以某种方式修改我正在使用的翻译器脚本。

脚本如下:

    {
    "translatorID": "5af42734-7cd5-4c69-97fc-bc406999bdba",
    "label": "Atypon Journals",
    "creator": "Sebastian Karcher",
    "target": "^https?://[^?#]+(?:/doi/((?:abs|abstract|full|figure|ref|citedby|book)/)?10\\.|/action/doSearch\\?)|^https?://[^/]+/toc/",
    "minVersion": "3.0",
    "maxVersion": "",
    "priority": 270,
    "inRepository": true,
    "translatorType": 4,
    "browserSupport": "gcsibv",
    "lastUpdated": "2015-10-15 22:24:05"
}
/*
Atypon Journals Translator
Copyright (C) 2011-2014 Sebastian Karcher
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

/**
 * Determines what kind of item(s) the given page offers.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - URL of the page
 * @return {String|Boolean|undefined} "multiple" for a table of contents or
 *   search page with usable results (false if it has none); "book",
 *   "bookSection" or "journalArticle" for a page that links to the citation
 *   download form; undefined otherwise
 */
function detectWeb(doc, url)
{
    // Table-of-contents and search result pages
    if (url.search(/^https?:\/\/[^\/]+\/toc\/|\/action\/doSearch\?/) != -1)
    {
        return getSearchResults(doc, true) ? "multiple" : false;
    }
    var citLinks = ZU.xpath(doc, '//a[contains(@href, "/action/showCitFormats")]');
    if (citLinks.length > 0) {
        if (url.indexOf('/doi/book/') != -1) {
            return 'book';
        }
        // URLs ending in ".ch" followed by digits, e.g. ".ch12"
        else if (url.search(/\.ch\d+$/) != -1) {
            return 'bookSection';
        }
        return "journalArticle";
    }
}
/**
 * Collects the article links listed on a table-of-contents or search page.
 *
 * @param {Document} doc - the listing page
 * @param {Boolean} checkOnly - when truthy, return true as soon as one usable
 *   entry is found instead of collecting all of them
 * @param {Object} [extras] - when given, receives for every article URL an
 *   object { pdf: ... } holding the result of buildPdfUrl for that entry
 * @return {Object|Boolean} map of article URL to title; true in checkOnly mode
 *   when something was found; false when nothing was found
 */
function getSearchResults(doc, checkOnly, extras) {
    var form = doc.getElementsByName('frmSearchResults')[0]
        || doc.getElementsByName('frmAbs')[0];
    if (!form) {
        Z.debug('Atypon: multiples container not found.');
        return false;
    }

    var linkTest = 'a[contains(@href, "/doi/abs/") or contains(@href, "/doi/abstract/") or '
        + 'contains(@href, "/doi/full/") or contains(@href, "/doi/book/")]';
    var primaryXPath = '(.//' + linkTest + ')[1]/@href';
    var alternateXPath = '(.//ul[contains(@class, "icon-list")]/li/'
        + linkTest + ')[1]/@href';
    var results = {};
    var anyFound = false;

    // First layout: one "articleEntry" element per article
    var entries = form.getElementsByClassName('articleEntry');
    for (var n = 0; n < entries.length; n++) {
        var entry = entries[n];
        var titleNode = entry.getElementsByClassName('art_title')[0];
        if (!titleNode) continue;
        var entryTitle = ZU.trimInternal(titleNode.textContent);

        var linkRoot = entry;
        var href = ZU.xpathText(linkRoot, primaryXPath);
        if (!href) {
            // e.g. http://pubs.rsna.org/toc/radiographics/toc/33/7 shows links in adjacent div
            linkRoot = entry.nextElementSibling;
            if (!linkRoot || linkRoot.classList.contains('articleEntry')) continue;
            href = ZU.xpathText(linkRoot, primaryXPath);
            if (!href) continue;
        }
        if (checkOnly) return true;

        anyFound = true;
        if (extras) {
            extras[href] = { pdf: buildPdfUrl(href, linkRoot) };
        }
        results[href] = entryTitle;
    }
    if (anyFound) return results;

    // Second layout: "item-details" elements with the title in an h3
    Z.debug("Trying an alternate multiple format");
    var details = form.getElementsByClassName("item-details");
    for (var k = 0; k < details.length; k++) {
        var detail = details[k];
        var heading = ZU.xpathText(detail, './h3');
        if (!heading) continue;
        heading = ZU.trimInternal(heading);

        var detailHref = ZU.xpathText(detail, alternateXPath);
        if (!detailHref) continue;
        if (checkOnly) return true;

        anyFound = true;
        if (extras) {
            extras[detailHref] = { pdf: buildPdfUrl(detailHref, detail) };
        }
        results[detailHref] = heading;
    }
    return anyFound ? results : false;
}
// Matches the "/doi/" path prefix of an article URL together with its optional
// view segment (abs, abstract, full, ...), so that the prefix can be swapped
// for another path. Keep this in line with target regexp
var replURLRegExp = /\/doi\/((?:abs|abstract|full|figure|ref|citedby|book)\/)?/;
/**
 * Derives the full-text PDF URL for an article from the links present
 * under a given node.
 *
 * @param {String} url - article URL
 * @param {Node} root - node whose descendant links are inspected
 * @return {String|Boolean} the article URL rewritten to the first PDF path
 *   that is linked under root, or false when none applies
 */
function buildPdfUrl(url, root) {
    // The whole thing is probably going to fail anyway if the URL has no /doi/ part
    if (replURLRegExp.test(url)) {
        var candidates = ['/doi/pdf/', '/doi/pdfplus/'];
        for (var c = 0; c < candidates.length; c++) {
            var anchors = ZU.xpath(root, './/a[contains(@href, "' + candidates[c] + '")]');
            if (anchors.length) {
                return url.replace(replURLRegExp, candidates[c]);
            }
        }

        Z.debug('PDF link not found.');
        // For anything but a whole document (nodeType 9), list the links for debugging
        if (root.nodeType != 9 /*DOCUMENT_NODE*/) {
            Z.debug('Available links:');
            var anchorList = root.getElementsByTagName('a');
            if (!anchorList.length) {
                Z.debug('No links');
            }
            for (var a = 0; a < anchorList.length; a++) {
                Z.debug(anchorList[a].href);
            }
        }
    }
    return false;
}
/**
 * Translator entry point: lets the user pick from a listing page and fetches
 * every selected article, or scrapes the current page directly.
 *
 * @param {Document} doc - the loaded page
 * @param {String} url - URL of the page
 */
function doWeb(doc, url) {
    if (detectWeb(doc, url) == "multiple") {
        // Filled by getSearchResults with one { pdf: ... } entry per article URL
        var extras = {};
        Zotero.selectItems(getSearchResults(doc, false, extras), function (items) {
            if (!items) {
                return true;
            }
            var articles = [];
            for (var itemurl in items) {
                articles.push({
                    // Drop the "?prev..." query string from the article URL
                    url: itemurl.replace(/\?prev.+/, ""),
                    extras: extras[itemurl]
                });
            }
            fetchArticles(articles);
        });
    } else {
        scrape(doc, url, {pdf: buildPdfUrl(url, doc)});
    }
}
/**
 * Normalizes a string that is written entirely in upper case; any other
 * string is returned untouched.
 *
 * @param {String} str - text to normalize
 * @param {Boolean} [titleCase] - when truthy use ZU.capitalizeTitle, otherwise
 *   keep the first character and lower-case the rest
 * @return {String}
 */
function fixCase(str, titleCase) {
    var isAllCaps = (str.toUpperCase() == str);
    if (!isAllCaps) {
        return str;
    }
    return titleCase
        ? ZU.capitalizeTitle(str, true)
        : str.charAt(0) + str.substr(1).toLowerCase();
}
/**
 * Scrapes the queued articles one after another. The first entry is removed
 * from the queue and processed; the rest follow once it is done.
 *
 * @param {Array} articles - queue of { url, extras } objects; it is emptied
 *   as the articles are processed
 */
function fetchArticles(articles) {
    if (articles.length) {
        var next = articles.shift();
        var onDocument = function (doc, url) {
            scrape(doc, url, next.extras);
        };
        var onDone = function () {
            if (articles.length) {
                fetchArticles(articles);
            }
        };
        ZU.processDocuments(next.url, onDocument, onDone);
    }
}
/**
 * Scrapes a single article page: downloads the RIS citation for the DOI found
 * in the URL, runs it through the RIS import translator and completes the
 * resulting item with data taken from the page itself.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of the article page; must contain a DOI ("10.…")
 * @param {Object} [extras] - extras.pdf, when truthy, is attached as the
 *   full-text PDF URL
 */
function scrape(doc, url, extras) {
    url = url.replace(/[?#].*/, "");
    var doi = url.match(/10\.[^?#]+/)[0];
    var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");
    var abstract = doc.getElementsByClassName('abstractSection')[0];
    //var authorAffiliation = doc.getElementsByClassName('listGroup')[0];
    var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
    Z.debug("Citation URL: " + citationurl);
    ZU.processDocuments(citationurl, function (citationDoc) {
        // The citation form carries the file name that the download request expects
        var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
        Z.debug("Filename: " + filename);
        var get = '/action/downloadCitation';
        var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';
        ZU.doPost(get, post, function (text) {
            //Z.debug(text);
            var translator = Zotero.loadTranslator("import");
            // Calling the RIS translator
            translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
            translator.setString(text);
            translator.setHandler("itemDone", function (obj, item) {
                // Sometimes we get titles and authors in all caps
                item.title = fixCase(item.title);
                for (var i = 0; i < item.creators.length; i++) {
                    item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
                    if (item.creators[i].firstName) {
                        item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
                    }
                }
                item.url = url;
                //for Emerald, get rid of the "null" that they add at the end of every title:
                if (url.indexOf("www.emeraldinsight.com") != -1) {
                    item.title = item.title.replace(/null$/, "");
                }
                item.notes = [];
                // Index loop: for-in would also visit any non-index property of the array
                for (var t = 0; t < tags.length; t++) {
                    item.tags.push(tags[t].textContent);
                }
                if (abstract) {
                    // Drop "Abstract" prefix
                    // This is not excellent, since some abstracts could
                    // conceivably begin with the word "abstract"
                    item.abstractNote = abstract.textContent
                        .replace(/^\s*abstract\s*/i, '');
                }
                item.attachments = [];
                if (extras && extras.pdf) {
                    item.attachments.push({
                        url: extras.pdf,
                        title: "Full Text PDF",
                        mimeType: "application/pdf"
                    });
                }
                item.attachments.push({
                    document: doc,
                    title: "Snapshot",
                    mimeType: "text/html"
                });
                // Host name without scheme and "www.", e.g. "example.org (Atypon)"
                item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
                    .replace(/[\/:].*/, '') + " (Atypon)";
                item.complete();
            });
            translator.translate();
        });
    });
}

所以,有没有人能告诉我应该如何修改这个脚本,才能得到作者所属机构?我知道脚本应该到 HTML 中 class 为 "listGroup" 的元素里查找作者所属机构。

如果你需要更多的信息,这里是所有zotero翻译器的链接,这里是关于zotero的:

我这样解决了这个问题:

/**
 * Scrapes a single article page like the stock translator does, and in
 * addition copies the text of every element with class "listGroup" into the
 * item's "extra" field.
 *
 * @param {Document} doc - the article page
 * @param {String} url - URL of the article page; must contain a DOI ("10.…")
 * @param {Object} [extras] - extras.pdf, when truthy, is attached as the
 *   full-text PDF URL
 */
function scrape(doc, url, extras) {
    url = url.replace(/[?#].*/, "");
    var doi = url.match(/10\.[^?#]+/)[0];
    var citationurl = url.replace(replURLRegExp, "/action/showCitFormats?doi=");
    // affiliations collects the texts; affiliation holds the page elements
    var affiliations = [];
    var affiliation = doc.getElementsByClassName('listGroup');

    var abstract = doc.getElementsByClassName('abstractSection')[0];
    var tags = ZU.xpath(doc, '//p[@class="fulltext"]//a[contains(@href, "keyword") or contains(@href, "Keyword=")]');
    Z.debug("Citation URL: " + citationurl);
    ZU.processDocuments(citationurl, function (citationDoc) {
        // The citation form carries the file name that the download request expects
        var filename = citationDoc.evaluate('//form//input[@name="downloadFileName"]', citationDoc, null, XPathResult.ANY_TYPE, null).iterateNext().value;
        Z.debug("Filename: " + filename);
        var get = '/action/downloadCitation';
        var post = 'doi=' + doi + '&downloadFileName=' + filename + '&format=ris&direct=true&include=cit';
        ZU.doPost(get, post, function (text) {
            //Z.debug(text);
            var translator = Zotero.loadTranslator("import");
            // Calling the RIS translator
            translator.setTranslator("32d59d2d-b65a-4da4-b0a3-bdd3cfb979e7");
            translator.setString(text);
            translator.setHandler("itemDone", function (obj, item) {
                // Sometimes we get titles and authors in all caps
                item.title = fixCase(item.title);
                for (var i = 0; i < item.creators.length; i++) {
                    item.creators[i].lastName = fixCase(item.creators[i].lastName, true);
                    if (item.creators[i].firstName) {
                        item.creators[i].firstName = fixCase(item.creators[i].firstName, true);
                    }
                }
                item.url = url;
                //for Emerald, get rid of the "null" that they add at the end of every title:
                if (url.indexOf("www.emeraldinsight.com") != -1) {
                    item.title = item.title.replace(/null$/, "");
                }
                item.notes = [];
                // Index loop: for-in would also visit any non-index property of the array
                for (var t = 0; t < tags.length; t++) {
                    item.tags.push(tags[t].textContent);
                }
                if (abstract) {
                    // Drop "Abstract" prefix
                    // This is not excellent, since some abstracts could
                    // conceivably begin with the word "abstract"
                    item.abstractNote = abstract.textContent
                        .replace(/^\s*abstract\s*/i, '');
                }
                item.attachments = [];
                if (extras && extras.pdf) {
                    item.attachments.push({
                        url: extras.pdf,
                        title: "Full Text PDF",
                        mimeType: "application/pdf"
                    });
                }
                item.attachments.push({
                    document: doc,
                    title: "Snapshot",
                    mimeType: "text/html"
                });
                // Host name without scheme and "www.", e.g. "example.org (Atypon)"
                item.libraryCatalog = url.replace(/^https?:\/\/(?:www\.)?/, '')
                    .replace(/[\/:].*/, '') + " (Atypon)";

                //Affiliations: read the text of each page element into the array.
                // The element collection has no push/join, so only the array is written to.
                for (var a = 0; a < affiliation.length; a++) {
                    affiliations.push(affiliation[a].textContent);
                }
                // Leave "extra" untouched when the page lists no affiliations
                if (affiliations.length) {
                    item.extra = affiliations.join("; ");
                }

                item.complete();
            });
            translator.translate();
        });
    });
}

我创建了一个名为 affiliations 的数组和一个名为 affiliation 的变量(保存 class 为 listGroup 的元素),然后用取到的字符串填充数组,并把结果存入 Zotero 的 extra 字段——因为 Zotero 没有专门存放作者所属机构的字段。这是一个小技巧,这样我的程序就能读到所属机构信息了。

最新更新