如何自动将BibTex引用转换为Zotero可解析的内容



我有一个引文系统,它将用户注释发布到wiki(Researchr)。从程序上讲,我可以访问每个条目的完整BibTeX记录,并且我还将其显示在各个页面上(例如,单击BibTeX)。这是为了方便其他引文管理器的用户自动导入他们感兴趣的论文的引文。我还希望其他引文管理器,尤其是Zotero,能够自动检测和导入引文。

Zotero列出了许多公开它将理解的元数据的方法,包括使用RDF、COiNS、Dublin Core和unAPI的元标签。有没有一个Ruby库可以自动将BibTeX转换为这些标准,或者有一个Javascript库?我可能会创建一些东西,但如果存在一些东西,它会更加健壮(BibTeX有很多发布类型和字段等)。

这里有一个BibTeX2RDF转换器,可能就是您想要的。

unAPI不是一种数据标准,它是一种为Zotero和其他程序提供数据的方式。Zotero导入Bibtex,因此通过unAPI提供Bibtex服务效果良好。Inspire就是这样一个网站的例子:http://inspirehep.net/

现在可以直接在Zotero中导入.bib类型的bibtex文件。然而,我注意到我的bibtex文件通常不如Zotero中的条目完整(尤其是经常缺少DOI),并且我没有找到一个“自动补全”功能(基于bibtex条目中已有的数据)。

因此,我先使用Zotero导入.bib文件,以确保所有条目都在其中。然后,我运行一个python脚本,它为.bib文件中的条目查找所有缺失的DOI,并将它们导出到一个以空格分隔的.txt文件中:

# pip install habanero
from habanero import Crossref
import re

def titletodoi(keyword):
    """Look up the DOI of a paper on Crossref by its title.

    Only the first Crossref hit is considered. It is accepted when its title
    equals *keyword* after both are lower-cased and stripped of every
    non-word character (whitespace, punctuation, ...).

    :param keyword: title of the paper to search for
    :returns: the DOI of the first hit, or None when there is no hit, the hit
        has no title, or the titles do not match
    """

    def squash(text):
        # \W removes whitespace and punctuation, so titles that differ only
        # in spacing or punctuation still compare equal.
        return re.sub(r"\W", "", text.lower())

    cr = Crossref()
    result = cr.works(query=keyword)
    items = result["message"]["items"]
    if not items:
        return None

    best_hit = items[0]
    # The title field is iterated and concatenated, i.e. treated as a
    # sequence of string fragments.
    title_parts = best_hit.get("title")
    if not title_parts:
        return None

    if squash(keyword) != squash("".join(title_parts)):
        return None
    return best_hit["DOI"]

def get_dois(titles):
    """Collect the DOIs that can be resolved for a list of titles.

    Each title is looked up with titletodoi. Titles whose lookup fails or
    yields None are skipped, so the result may be shorter than *titles*.

    :param titles: iterable of paper titles
    :returns: list of the DOIs found, in input order
    """
    dois = []
    for title in titles:
        try:
            doi = titletodoi(title)
        except Exception as error:
            # Best effort: one failed lookup must not abort the whole run,
            # but Ctrl-C / SystemExit are no longer swallowed.
            print(f"lookup failed for title={title}: {error!r}")
            continue
        print(f"doi={doi}, title={title}")
        if doi is not None:
            dois.append(doi)
    print(f"dois={dois}")
    return dois

def read_titles_from_file(filepath):
    """Read a text file and return its titles.

    The file content is split into lines, which are then passed through
    splits_lines to break up lines that hold several entries.

    :param filepath: path of the text file to read
    :returns: whatever splits_lines returns for the file's lines
    """
    with open(filepath) as handle:
        content = handle.read()
    return splits_lines(content.splitlines())

def splits_lines(lines):
    """Split every line on ';' and return all pieces as one flat list.

    :param lines: iterable of strings
    :returns: list of the ';'-separated pieces, in their original order
    """
    pieces = []
    for line in lines:
        pieces.extend(line.split(";"))
    return pieces

def write_dois_to_file(dois, filename, separation_char):
    """Write the DOIs to a file, each one followed by the separator.

    An existing file is overwritten. The separator is also written after the
    last DOI.

    :param dois: iterable of DOI strings
    :param filename: path of the file to write
    :param separation_char: string appended after every DOI
    """
    # The context manager closes the file even when a write raises.
    with open(filename, "w") as textfile:
        textfile.writelines(doi + separation_char for doi in dois)

# Resolve the DOIs for every title in the input file and export them twice:
# once separated by spaces and once with one DOI per line.
filepath = "list_of_titles.txt"
titles = read_titles_from_file(filepath)
dois = get_dois(titles)
write_dois_to_file(dois, "dois_space.txt", " ")
# The separator has to be a real newline; a plain "n" would glue the DOIs
# together with the letter n.
write_dois_to_file(dois, "dois_per_line.txt", "\n")

然后将.txt文件中的DOI粘贴到Zotero的魔杖(通过标识符添加条目)中。接下来,我(手动)合并重复项,并选择最新添加的条目作为保留版本(因为它来自魔杖,数据最完整)。

之后,我运行另一个脚本,将.tex和.bib文件中的所有引用id更新为Zotero生成的引用id:

# Importing library
import bibtexparser
from bibtexparser.bparser import BibTexParser
from bibtexparser.customization import *
import os, fnmatch
import Levenshtein as lev

# Let's define a function to customize our entries.
# It takes a record and returns the customized record.
def customizations(record):
    """Run a fixed sequence of customization helpers over a record.

    Each helper receives the record returned by the previous one.

    :param record: a record
    :returns: -- customized record
    """
    # NOTE(review): `type` is presumably bibtexparser.customization.type,
    # brought in by the star import, and not the builtin -- confirm.
    pipeline = (
        type,
        author,
        editor,
        journal,
        keyword,
        link,
        page_double_hyphen,
        doi,
    )
    for step in pipeline:
        record = step(record)
    return record

def get_references(filepath):
    """Load a BibTeX file with a parser that applies `customizations`.

    :param filepath: path of the .bib file to read
    :returns: the object returned by bibtexparser.load
    """
    bib_parser = BibTexParser()
    bib_parser.customization = customizations
    with open(filepath) as handle:
        return bibtexparser.load(handle, parser=bib_parser)

def get_reference_mapping(main_filepath, sub_filepath, threshold=0.85):
    """Pair entries of two BibTeX files whose titles are nearly identical.

    Titles are compared case-insensitively after remove_curly_braces. Two
    entries match when their Levenshtein ratio exceeds *threshold*. Only
    matches whose IDs differ are recorded, because those are the ones that
    need a key replacement. Entries without a title are skipped.

    :param main_filepath: .bib file whose IDs should be kept
    :param sub_filepath: .bib file whose IDs should be replaced
    :param threshold: minimum (exclusive) Levenshtein ratio for a match
    :returns: tuple of (list of [main_entry, sub_entry] pairs, matched main
        entries, matched sub entries, all main entries, all sub entries)
    """

    def comparable_title(entry):
        # None marks an entry that cannot take part in the title comparison.
        title = entry.get("title")
        if title is None:
            return None
        return remove_curly_braces(title).lower()

    main_references = get_references(main_filepath)
    sub_references = get_references(sub_filepath)

    # Normalize every sub title once instead of once per main entry.
    sub_titles = [
        (sub_entry, comparable_title(sub_entry))
        for sub_entry in sub_references.entries
    ]

    found_sub = []
    found_main = []
    main_into_sub = []
    for main_entry in main_references.entries:
        main_title = comparable_title(main_entry)
        if main_title is None:
            continue
        for sub_entry, sub_title in sub_titles:
            if sub_title is None:
                continue
            # Match the reference ID if sufficiently similar titles are detected
            lev_ratio = lev.ratio(main_title, sub_title)
            if lev_ratio > threshold:
                print(f"lev_ratio={lev_ratio}")
                if main_entry["ID"] != sub_entry["ID"]:
                    print(f'replace: {sub_entry["ID"]} with: {main_entry["ID"]}')
                    main_into_sub.append([main_entry, sub_entry])
                    # Keep track of which entries have been found
                    found_sub.append(sub_entry)
                    found_main.append(main_entry)
    return (
        main_into_sub,
        found_main,
        found_sub,
        main_references.entries,
        sub_references.entries,
    )

def remove_curly_braces(string):
    """Return *string* with every '{' and '}' removed.

    :param string: text that may contain BibTeX-style braces
    :returns: the text without any curly braces
    """
    return string.replace("{", "").replace("}", "")

def replace_references(main_into_sub, directory, dry_run=True):
    """Report, and optionally apply, the ID replacements for matched entries.

    For every pair the sub entry's ID is to be replaced by the main entry's
    ID. By default the replacements are only printed.

    :param main_into_sub: list of [main_entry, sub_entry] pairs; each entry
        must provide an "ID" key
    :param directory: root directory whose *.tex and *.bib files are
        rewritten when *dry_run* is False
    :param dry_run: when True (default) only print the planned replacements;
        when False also perform them with findReplace
    """
    for pair in main_into_sub:
        main = pair[0]["ID"]
        sub = pair[1]["ID"]
        print(f"replace: {sub} with: {main}")
        if not dry_run:
            findReplace(directory, sub, main, "*.tex")
            findReplace(directory, sub, main, "*.bib")

def findReplace(directory, find, replace, filePattern):
    """Replace a substring in every matching file below a directory.

    The tree under *directory* is walked recursively. Each file whose name
    matches *filePattern* is read completely and written back with every
    occurrence of *find* replaced by *replace*.

    :param directory: root of the directory tree to process
    :param find: substring to search for
    :param replace: text that replaces each occurrence
    :param filePattern: fnmatch-style file name pattern, e.g. "*.tex"
    """
    start = os.path.abspath(directory)
    for folder, _subfolders, names in os.walk(start):
        targets = [
            os.path.join(folder, name)
            for name in fnmatch.filter(names, filePattern)
        ]
        for target in targets:
            with open(target) as handle:
                original = handle.read()
            # The file is rewritten even when nothing was replaced.
            with open(target, "w") as handle:
                handle.write(original.replace(find, replace))

def list_missing(main_references, sub_references):
    """Print the ID of every sub reference that has no main reference with that ID.

    The check is purely by ID; the printed message attributes the mismatch
    to a changed title.

    :param main_references: iterable of entries providing an "ID" key
    :param sub_references: iterable of entries providing an "ID" key
    """
    # Build the lookup once instead of once per sub reference.
    main_ids = {entry["ID"] for entry in main_references}
    for sub in sub_references:
        if sub["ID"] not in main_ids:
            print(f'the following reference has a changed title:{sub["ID"]}')

# Locations of the LaTeX project and of the two bibliographies to reconcile.
latex_root_dir = "some_path/"
main_filepath = f"{latex_root_dir}latex/Literature_study/zotero.bib"
sub_filepath = f"{latex_root_dir}latex/Literature_study/references.bib"

# Match the entries of both files by title similarity.
main_into_sub, found_main, found_sub, main_references, sub_references = (
    get_reference_mapping(main_filepath, sub_filepath)
)

# Report the ID replacements, then the sub IDs that are absent from the main file.
replace_references(main_into_sub, latex_root_dir)
list_missing(main_references, sub_references)

# References whose titles score below the similarity threshold are not paired
# automatically; list them here by hand as [main_id, sub_id].
manual_swap = []  # main into sub
# manual_swap.append(["cantley_impact_2021","cantley2021impact"])
# manual_swap.append(["widemann_envision_2021","widemann2020envision"])
for pair in manual_swap:
    main, sub = pair[0], pair[1]
    print(f"replace: {sub} with: {main}")
    # Uncomment the two calls below to actually perform the printed replacement.
    # findReplace(latex_root_dir, sub, main, "*.tex")
    # findReplace(latex_root_dir, sub, main, "*.bib")

最新更新