Commit 5e2bfa0b authored by ufzfq

script to create datasets for wikidata, example output and example input

import json

def extract_id_from_resource_uri(resource_uri):
    # the resource URI ends with the entity id, e.g. ".../entity/Q1079"
    q_index = resource_uri.find('Q')
    return resource_uri[q_index:]

series = []
with open('TVSeries.json', encoding="utf8") as json_file:
    data = json.load(json_file)
    for entry in data:
        series_uri = entry['series']
        series.append(extract_id_from_resource_uri(series_uri))
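
# note: the structure of TVSeries.json is assumed here to match a SPARQL result
# export where every row carries a 'series' binding with the full entity URI, e.g.:
# [{"series": "http://www.wikidata.org/entity/Q1079"}, {"series": "http://www.wikidata.org/entity/Q3815"}]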
import requests
import time
from tqdm import tqdm
class PropertyExtractor:
    def __init__(self, prop_name, clear_name):
        self.prop_name = prop_name
        self.clear_name = clear_name

    def extract(self, claims):
        """specify how to extract the desired values for the property
        the result must look like: {'wd_name' : prop_name, 'clear_name' : clear_name, 'data' : data different for each extractor}
        Args:
            claims (dict): a dict containing all properties (PXXXX) for an entity (QXXXX)
        """
        pass

    def can_extract(self, claims):
        return self.prop_name in claims

    def extract_property_values(self, extracted_data):
        return extracted_data

    def remember_property_vals(self, all_prop_vals, extracted_data):
        """add all values for the property that are themselves complex objects to a global set
        Args:
            all_prop_vals (set): holds all QXXXX values that were extracted for this dataset
            extracted_data (dict): the data with a structure that was generated by the extract method
        """
        pass

    def add_label_to_data(self, data, prop_val_to_label_map):
        """once the labels for the QXXXX property values are collected, add them to the previously extracted data
        Args:
            data (dict): the data with a structure that was generated by the extract method
            prop_val_to_label_map (dict): map from QXXXX to their respective label
        """
        pass

    def get_prop_name(self):
        return self.prop_name

    def get_clear_name(self):
        return self.clear_name
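
# for reference, a trimmed example of the wbgetentities 'claims' structure the
# extractors below walk (nesting follows the wikidata JSON model, ids are placeholders):
# claims = {
#     "P136":  [{"mainsnak": {"datavalue": {"value": {"id": "Q123"}}}}],
#     "P1113": [{"mainsnak": {"datavalue": {"value": {"amount": "+62"}}}}],
# }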
class GenericComplexValueExtractor(PropertyExtractor):
    """generic extractor for properties that link to other entities (complex data types)
    """
    def __init__(self, prop_name, clear_name):
        super().__init__(prop_name, clear_name)

    def extract(self, claims):
        genres = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": genres}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> ', self.clear_name)
            return result
        claim = claims[self.prop_name]
        for genre in claim:
            try:
                mainsnak = genre['mainsnak']
                entity_id = mainsnak['datavalue']['value']['id']
                genres.append(entity_id)
            except (KeyError, TypeError):
                print('failed to extract from: ', genre)
        return result

    def remember_property_vals(self, all_prop_vals, extracted_data):
        for genre in extracted_data:
            all_prop_vals.add(genre)

    def add_label_to_data(self, data, prop_val_to_label_map):
        result = []
        for genre in data:
            if genre in prop_val_to_label_map:
                label = prop_val_to_label_map[genre]
                result.append({"label": label, "id": genre})
            else:
                print('no label for value: ', genre, 'in extractor for: ', self.prop_name)
        return result
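
# sketch of a round trip through the extractor above (id/label are placeholders):
# extract(claims)                            -> {"wd_name": "P136", "clear_name": "genres", "data": ["Q123"]}
# add_label_to_data(["Q123"], {"Q123": "x"}) -> [{"label": "x", "id": "Q123"}]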
class PrimitiveValueExtractor(PropertyExtractor):
    def __init__(self, prop_name, clear_name):
        super().__init__(prop_name, clear_name)

    def extract(self, claims):
        pass

    def remember_property_vals(self, all_prop_vals, extracted_data):
        pass

    def add_label_to_data(self, data, prop_val_to_label_map):
        return data
class DateValueExtractor(PrimitiveValueExtractor):
    def __init__(self, prop_name, clear_name):
        super().__init__(prop_name, clear_name)

    def extract(self, claims):
        values = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": values}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> ', self.clear_name)
            return result
        claim = claims[self.prop_name]
        for value in claim:
            try:
                mainsnak = value['mainsnak']
                val = mainsnak['datavalue']['value']['time']
                values.append(val)
            except (KeyError, TypeError):
                print('failed to extract from: ', value)
        return result
class QuantityValueExtractor(PrimitiveValueExtractor):
    def __init__(self, prop_name, clear_name):
        super().__init__(prop_name, clear_name)

    def extract(self, claims):
        values = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": values}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> ', self.clear_name)
            return result
        claim = claims[self.prop_name]
        for value in claim:
            try:
                mainsnak = value['mainsnak']
                val = mainsnak['datavalue']['value']['amount']
                values.append(val)
            except (KeyError, TypeError):
                print('failed to extract from: ', value)
        return result
class LogoImageExtractor(PrimitiveValueExtractor):
    def __init__(self):
        super().__init__("P154", "logo_image")

    def extract(self, claims):
        values = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": values}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> ', self.clear_name)
            return result
        claim = claims[self.prop_name]
        for value in claim:
            try:
                mainsnak = value['mainsnak']
                val = mainsnak['datavalue']['value']
                values.append(self.create_commons_link(val))
            except (KeyError, TypeError):
                print('failed to extract from: ', value)
        return result

    def create_commons_link(self, logo_name):
        # Special:FilePath redirects to the actual file, spaces must be percent-encoded
        tokens = logo_name.split(' ')
        joined = "%20".join(tokens)
        return "http://commons.wikimedia.org/wiki/Special:FilePath/" + joined
class StringValueExtractor(PrimitiveValueExtractor):
    def __init__(self, prop_name, clear_name):
        super().__init__(prop_name, clear_name)

    def extract(self, claims):
        values = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": values}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> ', self.clear_name)
            return result
        claim = claims[self.prop_name]
        for value in claim:
            try:
                mainsnak = value['mainsnak']
                val = mainsnak['datavalue']['value']
                values.append(val)
            except (KeyError, TypeError):
                print('failed to extract from: ', value)
        return result
class CastMemberExtractor(PropertyExtractor):
    def __init__(self):
        super().__init__("P161", "cast")

    def extract(self, claims):
        members = []
        result = {"wd_name": self.prop_name, "clear_name": self.clear_name, "data": members}
        if not self.can_extract(claims):
            # print('failed to extract: ', self.prop_name, ' -> cast_members')
            return result
        claim = claims[self.prop_name]
        for cast_member in claim:
            cast_id = ""
            try:
                mainsnak = cast_member['mainsnak']
                cast_id = mainsnak['datavalue']['value']['id']
            except (KeyError, TypeError):
                print('failed to extract cast id for: ', cast_member)
            role_id = ""
            try:
                # the character role (P453) is attached as a qualifier of the cast statement
                qualifiers = cast_member['qualifiers']
                if 'P453' in qualifiers:
                    role_id = qualifiers['P453'][0]['datavalue']['value']['id']
            except (KeyError, TypeError):
                pass
            members.append({"cast_id": cast_id, "role_id": role_id})
        return result

    def remember_property_vals(self, all_prop_vals, extracted_data):
        for cast_info in extracted_data:
            role_id = cast_info['role_id']
            if len(role_id) > 2:  # skip the "" placeholder for missing roles
                all_prop_vals.add(role_id)
            all_prop_vals.add(cast_info['cast_id'])

    def add_label_to_data(self, data, prop_val_to_label_map):
        result = []
        for cast_info in data:
            updated_cast_info = {}
            role_id = cast_info['role_id']
            if role_id in prop_val_to_label_map:
                label = prop_val_to_label_map[role_id]
                updated_cast_info['character'] = {"character_name": label, "character_id": role_id}
            cast_id = cast_info['cast_id']
            updated_cast_info["cast_id"] = cast_id
            if cast_id in prop_val_to_label_map:
                label = prop_val_to_label_map[cast_id]
                updated_cast_info["cast_name"] = label
            else:
                print('no label for value: ', cast_info, 'in extractor for: ', self.prop_name)
            result.append(updated_cast_info)
        return result
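
# sketch of one cast entry as it moves through the extractor above (ids/names are placeholders):
# extract(...)           -> {"cast_id": "Q123", "role_id": "Q456"}
# add_label_to_data(...) -> {"cast_id": "Q123", "cast_name": "some actor",
#                            "character": {"character_name": "some role", "character_id": "Q456"}}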
class WBGetEntitiesBuilder:
    def build_entities_with_properties(self, entities) -> str:
        base = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids="
        entities_conc = self.concat_entities(entities)
        properties = "&props=labels%7Cdescriptions%7Cclaims%7Csitelinks/urls&languages=en&format=json"
        return base + entities_conc + properties

    def concat_entities(self, entities) -> str:
        # drop empty ids (e.g. the "" placeholders produced for missing cast ids)
        return "|".join(list(filter(lambda x: len(x) > 1, entities)))

    def build_entities_labels(self, entities) -> str:
        base = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids="
        entities_conc = self.concat_entities(entities)
        properties = "&props=labels&languages=en&format=json"
        return base + entities_conc + properties
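
# for illustration, build_entities_with_properties(['Q1079', 'Q3815']) produces:
# https://www.wikidata.org/w/api.php?action=wbgetentities&ids=Q1079|Q3815&props=labels%7Cdescriptions%7Cclaims%7Csitelinks/urls&languages=en&format=json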
class DatasetCreator:
    def split_list_into_chunks(self, l, chunk_size):
        for i in range(0, len(l), chunk_size):
            yield l[i:i + chunk_size]

    def extract_properties_for_entity(self, claims, entity, extractors):
        result = []
        for extractor in extractors:
            extractor_result = extractor.extract(claims)
            result.append(extractor_result)
        return result

    def extract_properties_for_entities(self, data_json, chunk, extractors):
        result = {}
        for entity in chunk:
            claims_for_entity = data_json[entity]['claims']
            properties = self.extract_properties_for_entity(claims_for_entity, entity, extractors)
            result[entity] = {"entity_id": entity, "label": "", "properties": properties}
        return result

    def remember_property_values(self, property_value_maps, all_prop_values, property_to_extractor_map):
        for entity in property_value_maps:
            props = property_value_maps[entity]['properties']
            for prop in props:
                prop_name = prop['wd_name']
                extractor = property_to_extractor_map[prop_name]
                extractor.remember_property_vals(all_prop_values, prop['data'])

    def build_prop_val_to_label_map(self, all_property_values, query_builder):
        props_list = list(all_property_values)
        chunks = self.split_list_into_chunks(props_list, 50)  # wbgetentities accepts at most 50 ids per call
        result = {}
        for chunk in tqdm(chunks):
            flattened_chunk = self.flatten_chunk(chunk)
            query = query_builder.build_entities_labels(flattened_chunk)
            print(query)
            data = requests.get(query)
            data_json = data.json()
            labels = data_json['entities']
            for entity in flattened_chunk:
                if entity in labels:
                    try:
                        result[entity] = labels[entity]['labels']['en']['value']
                    except (KeyError, TypeError):
                        print('No en label available for: ', entity)
                else:
                    print('No available label for entity: ', entity)
            time.sleep(1)
        return result

    def update_prop_vals_in_result(self, entity_result, prop_vals_to_labels, property_to_extractor_map):
        for entity in entity_result:
            props = entity_result[entity]['properties']
            updated_prop_vals = []
            for prop in props:
                prop_id = prop['wd_name']
                extractor = property_to_extractor_map[prop_id]
                adjusted_prop_vals = extractor.add_label_to_data(prop['data'], prop_vals_to_labels)
                updated_prop_vals.append({"wd_name": prop_id, "clear_name": prop['clear_name'], "data": adjusted_prop_vals})
            entity_result[entity]['properties'] = updated_prop_vals
            entity_result[entity]['label'] = prop_vals_to_labels.get(entity, "")

    def build_property_to_extractor_map(self, extractors):
        prop_to_ext = {}
        for extractor in extractors:
            prop_to_ext[extractor.get_prop_name()] = extractor
        return prop_to_ext

    def flatten_chunk(self, chunk):
        result = []
        for c in chunk:
            result.append(c)
        return result

    def create_data_set(self, entities, extractors):
        """create a dataset from wikidata for a given domain and specified properties
        Args:
            entities (list): QXXXX codes of wikidata entities of a common domain
            extractors (list): PropertyExtractors defined for the properties that shall be extracted for each entity
        """
        chunked_list = self.split_list_into_chunks(entities, 10)
        query_builder = WBGetEntitiesBuilder()
        property_to_extractor_map = self.build_property_to_extractor_map(extractors)
        all_property_values = set()
        entity_results = {}
        for chunk in tqdm(chunked_list):
            flattened_chunk = self.flatten_chunk(chunk)
            query = query_builder.build_entities_with_properties(list(flattened_chunk))
            data = requests.get(query)
            data_json = data.json()
            # dict of {entity_id : {'entity_id': ..., 'label': '', 'properties': [...]}}
            properties_for_entities = self.extract_properties_for_entities(data_json['entities'], flattened_chunk, extractors)
            entity_results.update(properties_for_entities)
            time.sleep(3)
        self.remember_property_values(entity_results, all_property_values, property_to_extractor_map)
        # the entities themselves also need labels
        for entity in entities:
            all_property_values.add(entity)
        prop_vals_to_labels = self.build_prop_val_to_label_map(all_property_values, query_builder)
        self.update_prop_vals_in_result(entity_results, prop_vals_to_labels, property_to_extractor_map)
        return entity_results
extractors = []
extractors.append(GenericComplexValueExtractor('P136', 'genres'))
extractors.append(GenericComplexValueExtractor('P170', 'creators'))
extractors.append(GenericComplexValueExtractor('P58', 'screenwriters'))
extractors.append(GenericComplexValueExtractor('P17', 'countries'))
extractors.append(GenericComplexValueExtractor('P921', 'main_subjects'))
extractors.append(GenericComplexValueExtractor('P57', 'directors'))
extractors.append(GenericComplexValueExtractor('P86', 'composers'))
extractors.append(GenericComplexValueExtractor('P344', 'directors_of_photography'))
extractors.append(GenericComplexValueExtractor('P449', 'broadcasters'))
extractors.append(GenericComplexValueExtractor('P527', 'seasons'))
extractors.append(StringValueExtractor("P1258", "rotten_tomatoes"))
extractors.append(StringValueExtractor("P345", "imdb"))
extractors.append(StringValueExtractor("P4983", "tmdb_id"))
extractors.append(LogoImageExtractor())
extractors.append(QuantityValueExtractor('P1113', 'number_episodes'))
extractors.append(QuantityValueExtractor('P2437', 'numbers_seasons'))
extractors.append(DateValueExtractor('P577', 'publication_date'))
extractors.append(DateValueExtractor('P580', 'start_time'))
extractors.append(DateValueExtractor('P582', 'end_time'))
extractors.append(CastMemberExtractor())
# series = ['Q1079', 'Q3815']
dataset_creator = DatasetCreator()
result = dataset_creator.create_data_set(series, extractors)
# print(result)
import pickle

print(json.dumps(result))
with open('TVSeries2.pkl', 'wb') as f:
    pickle.dump(result, f)
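
# a minimal sketch for loading the dataset back in later (assumes the file written above):
# with open('TVSeries2.pkl', 'rb') as f:
#     dataset = pickle.load(f)
# print(len(dataset), 'series loaded')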