使用python从wikidata转储中提取别名



我正在尝试从 Wikidata 转储中提取各条目的某些字段,但在提取特定语言的 aliases(别名)字段时遇到了问题。我的代码基于 how_to_use_a_wikidata_dump 中的代码并做了一些修改,但 aliases 字段总是返回空值:

# Walk every entity record yielded from the dump and pull selected fields
# out of it using pydash dotted-path lookups.
for record in wikidata(args.dumpfile):
    # `i` is a counter maintained outside this snippet.
    print('i = '+str(i)+' item '+record['id']+'  started!'+'\n')
    item_id = pydash.get(record, 'id')
    # First "instance of" (P31) claim, if any.
    item_type = pydash.get(record, 'claims.P31[0].mainsnak.datavalue.value.id')
    arabic_label = pydash.get(record, 'labels.ar.value')
    english_label = pydash.get(record, 'labels.en.value')
    # NOTE: unlike labels/descriptions, 'aliases.<lang>' holds a LIST of
    # {"language", "value"} dicts (see the sample record), so a path ending
    # in '.value' matches nothing and these two lookups come back empty.
    # This is the problem the question is about.
    arabic_aliases =pydash.get(record, 'aliases.ar.value')
    english_aliases =pydash.get(record, 'aliases.en.value')
    arabic_desc = pydash.get(record, 'descriptions.ar.value')
    english_desc = pydash.get(record, 'descriptions.en.value')
    # First P910 claim, if any.
    main_category = pydash.get(record, 'claims.P910[0].mainsnak.datavalue.value.id')
    arwiki = pydash.get(record, 'sitelinks.arwiki.title')
    arwikiquote = pydash.get(record, 'sitelinks.arwikiquote.title')
    enwiki = pydash.get(record, 'sitelinks.enwiki.title')
    enwikiquote = pydash.get(record, 'sitelinks.enwikiquote.title')

Wikidata 条目的 JSON 格式说明可在此处找到:JSON 格式

示例 JSON 记录

{
"pageid": 186,
"ns": 0,
"title": "Q60",
"lastrevid": 199780882,
"modified": "2020-02-27T14:37:20Z",
"id": "Q60",
"type": "item",
"aliases": {
"en": [
{
"language": "en",
"value": "NYC"
},
{
"language": "en",
"value": "New York"
}
],
"fr": [
{
"language": "fr",
"value": "New York City"
},
{
"language": "fr",
"value": "NYC"
}
],
"zh-mo": [
{
"language": "zh-mo",
"value": "\u7d10\u7d04\u5e02"
}
]
},
"labels": {
"en": {
"language": "en",
"value": "New York City"
},
"ar": {
"language": "ar",
"value": "\u0645\u062f\u064a\u0646\u0629 \u0646\u064a\u0648 \u064a\u0648\u0631\u0643"
},
"fr": {
"language": "fr",
"value": "New York City"
},
"my": {
"language": "my",
"value": "\u1014\u101a\u1030\u1038\u101a\u1031\u102c\u1000\u103a\u1019\u103c\u102d\u102f\u1037"
},
"ps": {
"language": "ps",
"value": "\u0646\u064a\u0648\u064a\u0627\u0631\u06a9"
}
},
"descriptions": {
"en": {
"language": "en",
"value": "largest city in New York and the United States of America"
},
"it": {
"language": "it",
"value": "citt\u00e0 degli Stati Uniti d'America"
},
"pl": {
"language": "pl",
"value": "miasto w Stanach Zjednoczonych"
},
"ro": {
"language": "ro",
"value": "ora\u015ful cel mai mare din SUA"
}
},
"claims": {
"P1151": [
{
"id": "Q60$6f832804-4c3f-6185-38bd-ca00b8517765",
"mainsnak": {
"snaktype": "value",
"property": "P1151",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q6342720",
"numeric-id": 6342720
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P625": [
{
"id": "q60$f00c56de-4bac-e259-b146-254897432868",
"mainsnak": {
"snaktype": "value",
"property": "P625",
"datatype": "globe-coordinate",
"datavalue": {
"value": {
"latitude": 40.67,
"longitude": -73.94,
"altitude": null,
"precision": 0.00027777777777778,
"globe": "http://www.wikidata.org/entity/Q2"
},
"type": "globecoordinate"
}
},
"type": "statement",
"rank": "normal",
"references": [
{
"hash": "7eb64cf9621d34c54fd4bd040ed4b61a88c4a1a0",
"snaks": {
"P143": [
{
"snaktype": "value",
"property": "P143",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q328",
"numeric-id": 328
},
"type": "wikibase-entityid"
}
}
]
},
"snaks-order": [
"P143"
]
}
]
}
],
"P150": [
{
"id": "Q60$bdddaa06-4e4b-f369-8954-2bb010aaa057",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q11299",
"numeric-id": 11299
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$0e484d5b-41a5-1594-7ae1-c3768c6206f6",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18419",
"numeric-id": 18419
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$e5000a60-42fc-2aba-f16d-bade1d2e8a58",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18424",
"numeric-id": 18424
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$4d90d6f4-4ab8-26bd-f2a5-4ac2a6eb48cd",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18426",
"numeric-id": 18426
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
},
{
"id": "Q60$ede49e3c-44f6-75a3-eb74-6a89886e30c9",
"mainsnak": {
"snaktype": "value",
"property": "P150",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q18432",
"numeric-id": 18432
},
"type": "wikibase-entityid"
}
},
"type": "statement",
"rank": "normal"
}
],
"P6": [
{
"id": "Q60$5cc8fc79-4807-9800-dbea-fe9c20ab273b",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q4911497",
"numeric-id": 4911497
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "c53f3ca845b789e543ed45e3e1ecd1dd950e30dc",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002014-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580"
],
"type": "statement",
"rank": "preferred"
},
{
"id": "q60$cad4e313-4b5e-e089-08b9-3b1c7998e762",
"mainsnak": {
"snaktype": "value",
"property": "P6",
"datatype": "wikibase-item",
"datavalue": {
"value": {
"entity-type": "item",
"id": "Q607",
"numeric-id": 607
},
"type": "wikibase-entityid"
}
},
"qualifiers": {
"P580": [
{
"hash": "47c515b79f80e24e03375b327f2ac85184765d5b",
"snaktype": "value",
"property": "P580",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002002-01-01T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
],
"P582": [
{
"hash": "1f463f78538c49ef6adf3a9b18e211af7195240a",
"snaktype": "value",
"property": "P582",
"datatype": "time",
"datavalue": {
"value": {
"time": "+00000002013-12-31T00:00:00Z",
"timezone": 0,
"before": 0,
"after": 0,
"precision": 11,
"calendarmodel": "http://www.wikidata.org/entity/Q1985727"
},
"type": "time"
}
}
]
},
"qualifiers-order": [
"P580",
"P582"
]
}
],
"P856": [
{
"id": "Q60$4e3e7a42-4ec4-b7c3-7570-b103eb2bc1ac",
"mainsnak": {
"snaktype": "value",
"property": "P856",
"datatype": "url",
"datavalue": {
"value": "http://nyc.gov/",
"type": "string"
}
},
"type": "statement",
"rank": "normal"
}
]
},
"sitelinks": {
"afwiki": {
"site": "afwiki",
"title": "New York Stad",
"badges": []
},
"dewiki": {
"site": "dewiki",
"title": "New York City",
"badges": [
"Q17437798"
]
},
"dewikinews": {
"site": "dewikinews",
"title": "Kategorie:New York",
"badges": []
},
"elwiki": {
"site": "elwiki",
"title": "\u039d\u03ad\u03b1 \u03a5\u03cc\u03c1\u03ba\u03b7",
"badges": []
},
"enwiki": {
"site": "enwiki",
"title": "New York City",
"badges": []
},
"zhwikivoyage": {
"site": "zhwikivoyage",
"title": "\u7d10\u7d04",
"badges": []
},
"zuwiki": {
"site": "zuwiki",
"title": "New York (idolobha)",
"badges": []
}
}
}

该代码的结果是:

# Read the whole 'aliases.en' entry instead of a '.value' leaf beneath it.
english_aliases =pydash.get(record, 'aliases.en')
# NOTE(review): this prints the type of arabic_aliases, while the value
# printed on the next line is english_aliases; confirm which was intended.
print(type(arabic_aliases))
print(english_aliases)

<class 'list'>
[{'language': 'en', 'value': 'Kingdom of Belgium'}, {'language': 'en', 'value': 'BEL'}]

答案是:

# Collect the English alias strings of the current record into a set.
# 'aliases.en' is a list of {"language", "value"} dicts, so each alias
# string has to be read from the individual list items.
english_aliases = set()
# Records without English aliases keep the empty set.
if pydash.has(record, 'aliases.en'):
    for itm in pydash.get(record, 'aliases.en'):
        english_aliases.add(itm['value'])

最新更新