Commit 13d62075 authored by ufzfq

added manual, fixed some errors

parent 91a1d9f2
Notes on using the script:
The script lets you create a dataset for a specific domain based on Wikidata.
As input, the DatasetCreator takes, first, a list of entities (QXXXX) that belong to a domain (obtained, for example, from a Wikidata SPARQL query that returns all TV series).
The second input parameter is a list of PropertyExtractors.
These define the properties that are to be queried for each of the entities.
Since the structure of individual properties in Wikidata varies, dedicated extractors unfortunately have to be written for different properties; they define how the data is actually extracted.
The GenericComplexValueExtractor, for example, can be used to extract values for properties that reference other entities, the DateValueExtractor for all properties that reference dates, and so on.
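To make the interface concrete, here is a minimal usage sketch. It is not part of the commit: it mirrors the extractor example at the bottom of this diff, and the assumption that create_data_set returns the dict it assembles is mine, since the method's return statement lies outside the hunks shown here.

# Hypothetical usage sketch; assumes DatasetCreator, GenericComplexValueExtractor
# and StringValueExtractor are defined in the same script, as in this diff.
series = ['QXXXX', 'QYYYY']  # placeholder entity IDs, e.g. from the TVSeries.json step below
extractors = []
extractors.append(GenericComplexValueExtractor('P57', 'directors'))  # values reference other entities
extractors.append(StringValueExtractor('P345', 'imdb'))              # values are plain strings
result = DatasetCreator().create_data_set(series, extractors)  # assumed to return the assembled dict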
 import json
-# def extract_id_from_resource_uri(resource_uri):
-#     resource_uri = entry['series']
-#     q_index = resource_uri.find('Q')
+def extract_id_from_resource_uri(resource_uri):
+    resource_uri = entry['series']
+    q_index = resource_uri.find('Q')
-#     return resource_uri[q_index:]
-# series = []
-# with open('TVSeries.json', encoding="utf8") as json_file:
-#     data = json.load(json_file)
-#     for entry in data:
-#         series_uri = entry['series']
-#         series.append(extract_id_from_resource_uri(series_uri))
+    return resource_uri[q_index:]
+series = []
+with open('TVSeries.json', encoding="utf8") as json_file:
+    data = json.load(json_file)
+    for entry in data:
+        series_uri = entry['series']
+        series.append(extract_id_from_resource_uri(series_uri))
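For reference, the SPARQL query mentioned in the notes above could look like the sketch below; it is an assumption, not part of the commit. Exporting its results as JSON from the Wikidata Query Service yields entries with a 'series' key, which is exactly what entry['series'] reads here; Q5398426 is Wikidata's "television series" class and P31 is "instance of".

# Hypothetical query behind TVSeries.json (an assumption, not part of the commit).
TV_SERIES_QUERY = """
SELECT ?series WHERE {
  ?series wdt:P31 wd:Q5398426 .  # instance of "television series"
}
"""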
 import requests
@@ -298,11 +298,11 @@ class WBGetEntitiesBuilder:
 class DatasetCreator:
-    def split_list_into_chunks(l, chunk_size):
+    def split_list_into_chunks(self, l, chunk_size):
         for i in range(0, len(l), chunk_size):
             yield l[i:i + chunk_size]
-    def extract_properties_for_entity(claims, entity, extractors):
+    def extract_properties_for_entity(self, claims, entity, extractors):
         result = []
         for extractor in extractors:
             extractor_result = extractor.extract(claims)
@@ -311,11 +311,11 @@ class DatasetCreator:
         return result
-    def extract_properties_for_entities(data_json, chunk, extractors):
+    def extract_properties_for_entities(self, data_json, chunk, extractors):
         result = {}
         for entity in chunk:
             claims_for_entity = data_json[entity]['claims']
-            properties = extract_properties_for_entity(claims_for_entity, entity, extractors)
+            properties = self.extract_properties_for_entity(claims_for_entity, entity, extractors)
             chunk_properties = {"entity_id" : entity, "label" : "", "properties" : properties}
             result[entity] = {"properties" : properties}
@@ -323,7 +323,7 @@ class DatasetCreator:
-    def remember_property_values(property_value_maps, all_prop_values, property_to_extractor_map):
+    def remember_property_values(self, property_value_maps, all_prop_values, property_to_extractor_map):
         for entity in property_value_maps:
             props = property_value_maps[entity]['properties']
             for prop in props:
@@ -331,12 +331,12 @@ class DatasetCreator:
                 extractor = property_to_extractor_map[prop_name]
                 extractor.remember_property_vals(all_prop_values, prop['data'])
-    def build_prop_val_to_label_map(all_property_values, query_builder):
+    def build_prop_val_to_label_map(self, all_property_values, query_builder):
         props_list = list(all_property_values)
-        chunks = split_list_into_chunks(props_list, 50)
+        chunks = self.split_list_into_chunks(props_list, 50)
         result = {}
         for chunk in tqdm(chunks):
-            flattened_chunk = flatten_chunk(chunk)
+            flattened_chunk = self.flatten_chunk(chunk)
             query = query_builder.build_entites_labels(flattened_chunk)
             print(query)
             data = requests.get(query)
@@ -356,7 +356,7 @@ class DatasetCreator:
         return result
-    def update_prop_vals_in_result(entity_result, prop_vals_to_labels, property_to_extractor_map):
+    def update_prop_vals_in_result(self, entity_result, prop_vals_to_labels, property_to_extractor_map):
         for entity in entity_result:
             props = entity_result[entity]['properties']
             updated_prop_vals = []
@@ -369,36 +369,36 @@ class DatasetCreator:
             entity_result[entity]['properties'] = updated_prop_vals
             entity_result[entity]['label'] = prop_vals_to_labels[entity]
-    def build_property_to_extractor_map(extractors):
+    def build_property_to_extractor_map(self, extractors):
         prop_to_ext = {}
         for extractor in extractors:
             prop_to_ext[extractor.get_prop_name()] = extractor
         return prop_to_ext
-    def flatten_chunk(chunk):
+    def flatten_chunk(self, chunk):
         result = []
         for c in chunk:
             result.append(c)
         return result
-    def create_data_set(entities, extractors):
+    def create_data_set(self, entities, extractors):
         """create a dataset from wikidata for a given domain and specified properties
         Args:
             entities (list): QXXXX codes of wikidata entities of a common domain
             extractors (list): PropertyExtractors defined for the properties that shall be extracted for each entity
         """
-        chunked_list = split_list_into_chunks(entities, 10)
+        chunked_list = self.split_list_into_chunks(entities, 10)
         query_builder = WBGetEntitiesBuilder()
-        property_to_extractor_map = build_property_to_extractor_map(extractors)
+        property_to_extractor_map = self.build_property_to_extractor_map(extractors)
         all_property_values = set()
         entity_results = {}
         for chunk in tqdm(chunked_list):
-            flattened_chunk = flatten_chunk(chunk)
+            flattened_chunk = self.flatten_chunk(chunk)
             #print(flattened_chunk)
             query = query_builder.build_entities_with_properties(list(flattened_chunk))
@@ -407,20 +407,20 @@ class DatasetCreator:
             # print(data_json)
             #List of {'entity_id': entity_id, 'properties' : [property : [property_values]]}}
-            properties_for_entities = extract_properties_for_entities(data_json['entities'], flattened_chunk, extractors)
+            properties_for_entities = self.extract_properties_for_entities(data_json['entities'], flattened_chunk, extractors)
             # for p in properties_for_entities:
             #     entity_results.append(p)
             entity_results.update(properties_for_entities)
             time.sleep(3)
-        remember_property_values(entity_results, all_property_values, property_to_extractor_map)
+        self.remember_property_values(entity_results, all_property_values, property_to_extractor_map)
         for entity in entities:
             all_property_values.add(entity)
-        prop_vals_to_labels = build_prop_val_to_label_map(all_property_values, query_builder)
+        prop_vals_to_labels = self.build_prop_val_to_label_map(all_property_values, query_builder)
-        update_prop_vals_in_result(entity_results, prop_vals_to_labels, property_to_extractor_map)
+        self.update_prop_vals_in_result(entity_results, prop_vals_to_labels, property_to_extractor_map)
@@ -437,7 +437,7 @@ extractors.append(GenericComplexValueExtractor('P57', 'directors'))
 extractors.append(GenericComplexValueExtractor('P86', 'composers'))
 extractors.append(GenericComplexValueExtractor('P344', 'directors_of_photography'))
 extractors.append(GenericComplexValueExtractor('P449', 'broadcasters'))
-extractors.append(GenericComplexValueExtractor('P527', "seasons").)
+extractors.append(GenericComplexValueExtractor('P527', "seasons"))
 extractors.append(StringValueExtractor("P1258", "rotten_tomatoes"))
 extractors.append(StringValueExtractor("P345", "imdb"))
@@ -461,5 +461,5 @@ import pickle
 import json
 print(json.dumps(result))
-with open('TVSeries2.pkl', 'wb') as f:
+with open('TVSeries3.pkl', 'wb') as f:
     pickle.dump(result, f)
\ No newline at end of file