Commit 277742d3 authored by kristian.noullet's avatar kristian.noullet

Simplifications and feature comforts

Simplifications and feature comforts
parent 1892d382
# Agnos_mini
<h1>Quick Start Guide Steps</h1>
Agnos is a KG-agnostic entity linking framework, allowing for ease of extension and deployment.
The ease of extension refers both to supporting various knowledge graphs and to plugging in alternative methods for mention detection, candidate generation, entity disambiguation, and pruning.
<br>
<h1>Quick Start Guide</h1>
<ol start="0">
<li>Clone Repository</li>
<li>
Run install/BuildFiletree.java - as the name implies, it simply creates the file tree to make it easier to place required files.
</li>
<li>
Add your desired RDF KG as a .NT file to the execution environment's directory under "<i>./default/resources/data/kg.nt</i>"
</li>
<br>
<li>
Run <i>launcher.LauncherInstallation.java</i>
Load KG into an RDF Store by defining the location of your RDF-based KG within <i>install.LauncherSetupTDB:KGpath</i> and running it for your defined KG (in <i>install.LauncherSetupTDB:KG</i>).
</li>
<li>
Setup complete!<br>
Run Launcher.java to run a simple entity linking pipeline.<br>
Mention detection and candidate generation may now be performed!<br>
For disambiguation, the default scoring mechanisms of PageRank and VicinityScorerDirectedSparseGraph are now ready to be used.
</li>
</ol>
<h1>Full Startup Guide</h1>
<ol start="0">
<li>Clone Repository</li>
<li>
......
......@@ -79,23 +79,32 @@ public class JSONAPIAnnotator implements Executable {
private final String outFilepath = "/vol2/kris/api_agnos.log";
public JSONAPIAnnotator() {
log("Constructor1");
for (EnumModelType KG : EnumModelType.values()) {
this.KGs.put(KG.name(), KG);
}
this.embeddingMode = EnumEmbeddingMode.DEFAULT;
// this(KG, EnumEmbeddingMode.DEFAULT.val);
this(EnumEmbeddingMode.DEFAULT.val, EnumModelType.values());
}
public JSONAPIAnnotator(final EnumModelType KG) {
this(KG, EnumEmbeddingMode.DEFAULT.val);
public JSONAPIAnnotator(final EnumModelType... KG) {
this(EnumEmbeddingMode.DEFAULT.val, KG);
}
public JSONAPIAnnotator(final EnumModelType KG, final EnumEmbeddingMode embeddingMode) {
log("Constructor2(" + KG.name() + ")");
this.KGs.put(KG.name(), KG);
public JSONAPIAnnotator(final EnumEmbeddingMode embeddingMode, final EnumModelType... KGs) {
if (KGs != null && KGs.length > 0) {
for (EnumModelType KG : KGs) {
log("Constructor2(" + KG.name() + ")");
addKG(KG);
}
}
this.embeddingMode = embeddingMode;
addKG("wd", EnumModelType.WIKIDATA);
addKG("dbp", EnumModelType.DBPEDIA_FULL);
}
private void addKG(EnumModelType kg) {
	// Register the KG under its canonical enum name (EnumModelType.name()).
	this.KGs.put(kg.name(), kg);
}
private void addKG(final String key, EnumModelType kg) {
	// Re-key the KG under a caller-supplied alias (e.g. "wd" for WIKIDATA):
	// first drop any entry registered under the canonical enum name, then
	// register the same KG under the short key.
	this.KGs.remove(kg.name());
	this.KGs.put(key, kg);
}
@Override
......@@ -136,7 +145,13 @@ public class JSONAPIAnnotator implements Executable {
// Initialise AssignmentChooser
Stopwatch.start(chooserWatch);
this.disambiguatorMap.put(KG.name(), new Disambiguator(KG, this.embeddingMode));
final Set<String> wantedResources = new HashSet<>();
for (Map.Entry<String, Collection<String>> e : map.entrySet()) {
wantedResources.addAll(e.getValue());
}
final Disambiguator disambiguator = new Disambiguator(KG, this.embeddingMode, wantedResources);
this.disambiguatorMap.put(KG.name(), disambiguator);
Stopwatch.endOutput(chooserWatch);
this.prunerMap.put(KG.name(), new ThresholdPruner(1.0d));
// Add that it was initialised
......@@ -164,6 +179,7 @@ public class JSONAPIAnnotator implements Executable {
} catch (IOException e) {
e.printStackTrace();
}
System.out.println(string);
}
/**
......@@ -346,9 +362,15 @@ public class JSONAPIAnnotator implements Executable {
EnumModelType KG = null;
try {
KG = EnumModelType.valueOf(chosenKG);
return KG;
} catch (IllegalArgumentException iae) {
KG = null;
}
if ((KG = this.KGs.get(chosenKG)) != null) {
return KG;
}
if (KG == null) {
for (EnumModelType kg : EnumModelType.values()) {
if (kg.findableName().toLowerCase().contains(chosenKG.toLowerCase())) {
......
......@@ -38,11 +38,14 @@ public class LauncherSetupTDB implements Loggable {
// DBPEDIA
// CRUNCHBASE2
// MAG
CRUNCHBASE;
// CRUNCHBASE
DEFAULT
;
// DBPEDIA_FULL
// WIKIDATA;
System.out.println("Setting up TDB for: " + KG.name());
final String KGpath = "";
System.out.println("Setting up TDB for KG[" + KG.name()+"]");
final String KGpath = FilePaths.FILE_KNOWLEDGE_GRAPH.getPath(KG);
//"";
// "/vol2/cb/crunchbase-201510/dumps/crunchbase-dump-201510.nt";//CB2015
//"/vol2/cb/crunchbase-201806/dumps/crunchbase-dump-2018-06_sanitized.nt";// CB2018
// "./cb2018-06/crunchbase-dump-2018-06.nt";//NORMALIZED_CB2
......@@ -94,9 +97,10 @@ public class LauncherSetupTDB implements Loggable {
// Choose for which KG to load it into the TDB
// final EnumModelType KG = EnumModelType.MAG;
for (EnumModelType KG : EnumModelType.values()) {
final String KGpath = FilePaths.FILE_EXTENDED_GRAPH.getPath(KG);
final String KGpath = FilePaths.FILE_KNOWLEDGE_GRAPH.getPath(KG);
// Read a line to make sure it is not an empty file we are trying to load
try (BufferedReader br = new BufferedReader(new FileReader(KGpath))) {
// Read first line to check whether it's an empty file!
if (br.readLine() == null) {
// Skip this file if it's empty
getLogger().info("Skipping " + KG.name() + " due to empty file.");
......@@ -118,7 +122,7 @@ public class LauncherSetupTDB implements Loggable {
* @param KG which graph it corresponds to
* @param KGpath where to load it from
*/
private void exec(EnumModelType KG, final String KGpath) {
public void exec(EnumModelType KG, final String KGpath) {
final String datasetPath = FilePaths.DATASET.getPath(KG);
// Non-empty file
final Dataset dataset = TDBFactory.createDataset(datasetPath);
......
......@@ -118,8 +118,10 @@ public abstract class LiteralEntityQuery {
}
// Output to query results
outputMainChannel(varName, value, itVars.hasNext(), writers.get(0));
// Output to query linking when appropriate
outputAlternateChannels(varName, value, itVars.hasNext(), writers.subList(1, writers.size()));
if (writers.size() > 1) {
// Output to query linking when appropriate
outputAlternateChannels(varName, value, itVars.hasNext(), writers.subList(1, writers.size()));
}
}
}
......
package launcher;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.List;
import org.apache.jena.query.Dataset;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.tdb.TDBFactory;
import com.beust.jcommander.internal.Lists;
import install.LauncherSetupTDB;
import install.PageRankComputer;
import install.surfaceform.query.general.SFQuery;
import structure.config.constants.FilePaths;
import structure.config.kg.DefaultQuery;
import structure.config.kg.EnumModelType;
public class LauncherInstallation {

	/**
	 * Runs the installation pipeline for the DEFAULT knowledge graph: imports
	 * the KG file into the Jena TDB store, computes PageRank scores and
	 * extracts entity mentions (surface forms) via a SPARQL query.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		final EnumModelType KG = EnumModelType.DEFAULT;
		final String KGpath = FilePaths.FILE_KNOWLEDGE_GRAPH.getPath(KG);
		final String query = DefaultQuery.ALL_LABELS.query;
		// Import KG into RDF Store (Jena TDB Store)
		new LauncherSetupTDB().exec(KG, KGpath);
		// Compute PageRank and output to FilePaths.FILE_PAGERANK.getPath(KG)
		new PageRankComputer(KG).exec(new String[] { KGpath });
		// Extract mentions
		extractMentions(KG, query);
	}

	/**
	 * Executes the given surface-form SPARQL query against the default model
	 * of the KG's TDB dataset and writes the results to the KG's
	 * entity-surface-form linking file.
	 *
	 * @param KG    knowledge graph whose TDB dataset is queried
	 * @param query SPARQL SELECT query used to extract the surface forms
	 */
	private static void extractMentions(EnumModelType KG, String query) {
		// Execute default SF query on TDB to extract mentions
		final Dataset dataset = TDBFactory.createDataset(FilePaths.DATASET.getPath(KG));
		final Model model = dataset.getDefaultModel();
		try {
			final String SFout = FilePaths.FILE_ENTITY_SURFACEFORM_LINKING.getPath(KG);
			// NOTE(review): FileWriter uses the platform default charset -
			// confirm this is intended or switch to an explicit UTF-8 writer.
			try (final BufferedWriter bwOut = new BufferedWriter(new FileWriter(new File(SFout)))) {
				final List<BufferedWriter> writers = Lists.newArrayList();
				writers.add(bwOut);
				try {
					new SFQuery(KG).execSelectQuery(query, model, writers);
				} catch (IOException e) {
					System.err.println("[ERROR] Could not query surface forms.");
					e.printStackTrace();
				}
				writers.clear();
			} catch (IOException e1) {
				System.err.println("[ERROR] Issue with the BufferedWriter.");
				e1.printStackTrace();
			}
		} finally {
			// Always release the TDB-backed model and dataset, even if an
			// unexpected RuntimeException escapes the querying/writing above
			// (previously a failure would leak the open dataset).
			model.close();
			dataset.close();
		}
	}
}
......@@ -3,16 +3,17 @@ package launcher;
import org.json.JSONObject;
import api.JSONAPIAnnotator;
import structure.config.kg.EnumModelType;
public class LauncherTestJSONAPI {

	/**
	 * Smoke test for the JSON API annotator: instantiates the annotator over
	 * all registered KGs, initialises it and annotates one hard-coded JSON
	 * document, printing the annotation result to stdout.
	 *
	 * Removes the leftover duplicate declarations of {@code annotator} and
	 * {@code inString} (and a stray semicolon) that made this class
	 * uncompilable.
	 *
	 * @param args ignored
	 */
	public static void main(String[] args) {
		// No explicit KG: the target KG is selected per request via the
		// JSON document's "kg" field (here: 'wd').
		final JSONAPIAnnotator annotator = new JSONAPIAnnotator();
		annotator.init();
		final String input = ", input: 'steve jobs and joan baez were a couple'";
		final String inString = "{kg: 'wd', mentiondetection: true" + input + ", topk: true }";
		final JSONObject jsonObj = new JSONObject(inString);
		System.out.println(annotator.annotateDocument(jsonObj));
	}
}
......@@ -56,10 +56,19 @@ public class Disambiguator implements Loggable {
* @throws IOException
*/
public Disambiguator(final EnumModelType KG) throws IOException {
this(KG, EnumEmbeddingMode.DEFAULT);
this(KG, (Set<String>) null);
}
/**
 * Creates a disambiguator for the given KG using the default embedding mode.
 *
 * @param KG             knowledge graph to disambiguate against
 * @param wantedEntities NULLABLE; restricts embedding loading to these
 *                       entities (lazy loading for large embedding files) -
 *                       null loads all embeddings
 * @throws IOException if loading the required resources fails
 */
public Disambiguator(final EnumModelType KG, final Set<String> wantedEntities) throws IOException {
this(KG, EnumEmbeddingMode.DEFAULT, wantedEntities);
}
/**
 * Creates a disambiguator for the given KG with an explicit embedding mode
 * and no entity filter (all embeddings are loaded).
 *
 * @param KG            knowledge graph to disambiguate against
 * @param embeddingMode whether embeddings are loaded locally or accessed remotely
 * @throws IOException if loading the required resources fails
 */
public Disambiguator(final EnumModelType KG, final EnumEmbeddingMode embeddingMode) throws IOException {
this(KG, embeddingMode, null);
}
public Disambiguator(final EnumModelType KG, final EnumEmbeddingMode embeddingMode,
final Set<String> wantedEntities) throws IOException {
final CombineOperation combineOperation = CombineOperation.MAX_SIM;
// Pre-Scoring
......@@ -79,12 +88,13 @@ public class Disambiguator implements Loggable {
// }
// }
System.out.println("Added VicinityScorerDirectedSparseGraph");
addPostScorer(new VicinityScorerDirectedSparseGraph(KG));
final boolean doEmbeddings = false;
final boolean doEmbeddings = true;
if (doEmbeddings) {
this.similarityService = setupSimilarityService(KG, embeddingMode);
System.out.println("Added GraphWalkEmbeddingScorer " + "[" + KG.name() + "]");
this.similarityService = setupSimilarityService(KG, embeddingMode, wantedEntities);
addPostScorer(new GraphWalkEmbeddingScorer(new ContinuousHillClimbingPicker(
combineOperation.combineOperation, similarityService, pagerankLoader)));
} else {
......@@ -98,13 +108,13 @@ public class Disambiguator implements Loggable {
}
}
private EntitySimilarityService setupSimilarityService(EnumModelType KG, final EnumEmbeddingMode embeddingMode)
throws IOException {
private EntitySimilarityService setupSimilarityService(EnumModelType KG, final EnumEmbeddingMode embeddingMode,
final Set<String> wantedEntities) throws IOException {
final Map<String, List<Number>> entityEmbeddingsMap;
if (embeddingMode == EnumEmbeddingMode.LOCAL) {
entityEmbeddingsMap = GraphWalkEmbeddingScorer.humanload(
FilePaths.FILE_GRAPH_WALK_ID_MAPPING_ENTITY_HUMAN.getPath(KG),
FilePaths.FILE_EMBEDDINGS_GRAPH_WALK_ENTITY_EMBEDDINGS.getPath(KG));
FilePaths.FILE_EMBEDDINGS_GRAPH_WALK_ENTITY_EMBEDDINGS.getPath(KG), wantedEntities);
return new EntitySimilarityService(entityEmbeddingsMap);
} else {
return new EntitySimilarityService();
......
......@@ -49,6 +49,7 @@ public class ContinuousHillClimbingPicker extends HillClimbingPicker {
super.prune = false;
int iterationCounter = 0;
final List<Mention> copyContext = Lists.newArrayList(this.context);
System.out.println(getClass().getName()+" - Mentions["+copyContext+"]");
// Sorts them for the sake of initialisation picking based on word order
Collections.sort(copyContext, Comparators.mentionOffsetComparator);
// Computing clusters outside, so we don't have to redo it every time
......@@ -60,6 +61,7 @@ public class ContinuousHillClimbingPicker extends HillClimbingPicker {
final Map<String, List<MutablePair<String, Double>>> continuousChoices = new HashMap<>();
while (copyContext.size() > 1 && clusters.size() > 1) {
// Do the picking logic
System.out.println("Displaying (valid) clusters: "+displayMap(clusters, 10));
final Map<String, Pair<String, Double>> iterationChoices = super.pickItems(clusters);
// If no item has been picked, there is no need to continue... -> jump out
......
......@@ -50,12 +50,17 @@ public class GraphWalkEmbeddingScorer implements PostScorer<PossibleAssignment,
*/
public static Map<String, List<Number>> humanload(final String mappingInPath, final String embeddingInPath)
throws IOException {
// Backwards-compatible overload: null filter means "load all embeddings".
return humanload(mappingInPath, embeddingInPath, null);
}
public static Map<String, List<Number>> humanload(final String mappingInPath, final String embeddingInPath,
final Set<String> wantedEntities) throws IOException {
IDMappingLoader<String> entityMapping = new IDMappingLoader<String>().loadHumanFile(new File(mappingInPath));
final File embedFile = new File(embeddingInPath);
log().info("Loading embeddings from: " + embedFile.getAbsolutePath());
Stopwatch.start(GraphWalkEmbeddingScorer.class.getName());
final Map<String, List<Number>> entityEmbeddingsMap = EmbeddingsUtils.readEmbeddings(embedFile, entityMapping,
true);
true, wantedEntities);
log().info("Finished(" + Stopwatch.endOutput(GraphWalkEmbeddingScorer.class.getName())
+ " ms.) loading embeddings from: " + embedFile.getAbsolutePath());
entityMapping = null;
......
......@@ -132,6 +132,7 @@ public enum FilePaths {
FILE_NT_ENTITIES(DIR_DATA.path + "entities.nt", "NT File containing all entities"), //
// "Corrected" RDF Graph files
FILE_KNOWLEDGE_GRAPH(DIR_DATA.path + "kg.nt"), //
FILE_GRAPH_RDF(DIR_DATA.path + "rdf.ttl"), //
FILE_GRAPH_RDF_TYPES(DIR_DATA.path + "types.ttl"), //
// Extended TXT graph file
......
package structure.config.kg;
/**
 * Constant (SPARQL-based) query definitions used by the EnumModelType
 * enumeration for specific knowledge graphs to retrieve entities.
 *
 * @author Kristian Noullet
 */
public enum DefaultQuery {
	// NOTE: the entity variable HAS to be named ?s for RDF2Vec.
	/** All (subject, object) pairs whose object is a literal. */
	ALL_LITERALS("SELECT DISTINCT ?s ?o WHERE { ?s ?p ?o . FILTER(isLiteral(?o))}"), //
	/** All (subject, rdfs:label) pairs. */
	ALL_LABELS("SELECT DISTINCT ?s ?o WHERE { ?s <http://www.w3.org/2000/01/rdf-schema#label> ?o }"), //
	;

	/** Raw SPARQL SELECT query text associated with this constant. */
	public final String query;

	DefaultQuery(final String entityQuery) {
		this.query = entityQuery;
	}
}
......@@ -13,14 +13,20 @@ public enum EnumModelType {
//DBPEDIA(Strings.ROOTPATH.val + "dbpedia/", EntityQuery.DEFAULT), //
DBPEDIA_FULL(Strings.ROOTPATH.val + "dbpedia_full/", EntityQuery.DEFAULT, EnumConnection.SHETLAND_VIRTUOSO, false,
"http://dbpedia.org"), //
//FREEBASE(Strings.ROOTPATH.val + "freebase/", EntityQuery.DEFAULT), //
CRUNCHBASE(Strings.ROOTPATH.val + "crunchbase2018/", EntityQuery.CRUNCHBASE2), //
//CRUNCHBASE(Strings.ROOTPATH.val + "crunchbase2018/", EntityQuery.CRUNCHBASE2), //
//CRUNCHBASE2(Strings.ROOTPATH.val + "crunchbase2015/", EntityQuery.CRUNCHBASE2), //
//MINI_MAG(Strings.ROOTPATH.val + "mini_mag/", EntityQuery.MAG), //
//MAG(Strings.ROOTPATH.val + "mag/", EntityQuery.MAG), //
//DBLP(Strings.ROOTPATH.val + "dblp/", EntityQuery.DBLP), //
WIKIDATA(Strings.ROOTPATH.val + "wikidata/", EntityQuery.WIKIDATA),//
DEFAULT(Strings.ROOTPATH.val + "/", EntityQuery.DEFAULT) //
DEFAULT(Strings.ROOTPATH.val + "default/", EntityQuery.DEFAULT) //
;
public final String root;
public final EntityQuery query;
......
......@@ -10,6 +10,7 @@ import java.io.IOException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.BiFunction;
import org.apache.log4j.Logger;
......@@ -31,17 +32,17 @@ public class EmbeddingsUtils {
*/
/**
 * Reads embeddings from the given file with default settings: no ID mapping,
 * normalised vectors and no entity filter (everything is loaded).
 *
 * Drops the duplicated (unreachable) return statement left over from the
 * migration to the 4-argument overload.
 *
 * @param intputFile input file containing embeddings
 * @return populated embeddings map
 * @throws FileNotFoundException if the file was not found
 * @throws IOException on any other IO failure
 */
public static Map<String, List<Number>> readEmbeddings(final File intputFile)
		throws FileNotFoundException, IOException {
	return readEmbeddings(intputFile, null, true, null);
}
public static Map<String, List<Number>> readEmbeddings(final File intputFile,
final IDMappingLoader<String> mappingLoader, final boolean normalize)
final IDMappingLoader<String> mappingLoader, final boolean normalize, final Set<String> wantedEntities)
throws FileNotFoundException, IOException {
final String delim = Strings.EMBEDDINGS_TRAINED_DELIM.val;
// final boolean stripArrows = false;
// final String delim = " ";
final boolean stripArrows = true;
return readEmbeddings(intputFile, mappingLoader, normalize, delim, stripArrows);
return readEmbeddings(intputFile, mappingLoader, normalize, delim, stripArrows, wantedEntities);
}
/**
......@@ -50,11 +51,14 @@ public class EmbeddingsUtils {
* <b>Note</b>: If a vocabulary word appears multiple times, the latter will
* replace the existing one
*
* @param intputFile input file containing embeddings
* @param mappingLoader NULLABLE; used IDMappingLoader (if applicable)
* @param normalize whether to normalize the embeddings vectors
* @param delim what delimiter was used to output the embeddings
* @param stripArrows whether to strip arrows from the entity
* @param intputFile input file containing embeddings
* @param mappingLoader NULLABLE; used IDMappingLoader (if applicable)
* @param normalize whether to normalize the embeddings vectors
* @param delim what delimiter was used to output the embeddings
* @param stripArrows whether to strip arrows from the entity
* @param wantedEntities NULLABLE; allows for lazy-loading of embeddings (useful
* for particularly large embeddings files and when
* changes to MD are being done)
* @return populated embeddings map
* @throws FileNotFoundException if file was not found
* @throws IOException if any IO exception happens, most likely due to
......@@ -62,15 +66,17 @@ public class EmbeddingsUtils {
*/
public static Map<String, List<Number>> readEmbeddings(final File intputFile,
final IDMappingLoader<String> mappingLoader, final boolean normalize, final String delim,
final boolean stripArrows) throws FileNotFoundException, IOException {
final boolean stripArrows, final Set<String> wantedEntities) throws FileNotFoundException, IOException {
System.out.println("Wanted Entities: " + (wantedEntities == null ? "null" : wantedEntities.size()));
// Embeddings format: vocabularyWord <delim> List<Double>
final Map<String, List<Number>> embeddings = new HashMap<>();
int lineCounter = 0;
int lineCounter = 0, loadedCounter = 0;
String line = null;
try (final BufferedReader brIn = new BufferedReader(new FileReader(intputFile))) {
while ((line = brIn.readLine()) != null) {
if (lineCounter % 100_000 == 0) {
System.out.println("# of embeddings: " + lineCounter);
System.out
.println("# of embeddings: Loaded[" + loadedCounter + "] / Traversed[" + lineCounter + "]");
System.out.println("Current: " + line.substring(0, 100));
}
lineCounter++;
......@@ -78,6 +84,7 @@ public class EmbeddingsUtils {
// Word \t 1.23123 \t 2.1421421 ...
final String[] tokens = line.split(delim);
String vocab = tokens[0];
// If it's an ID and needs to translate to a resource
if (mappingLoader != null && !mappingLoader.isEmpty()) {
final String associatedWord = mappingLoader.getMapping(vocab);
if (associatedWord != null) {
......@@ -85,6 +92,15 @@ public class EmbeddingsUtils {
}
}
// Lazy loading component - only load it if it can be detected through mentions
if (wantedEntities != null && wantedEntities.size() > 0) {
if (!wantedEntities.contains(vocab)) {
// Not within our set, so skip it
continue;
}
}
// Strips < and > from token
if (stripArrows) {
final int endOffset = vocab.length() - 1;
if ((vocab.charAt(0) == '<') && (vocab.charAt(endOffset) == '>') && endOffset > 1) {
......@@ -107,6 +123,7 @@ public class EmbeddingsUtils {
embedding = normalize(embedding);
}
embeddings.put(vocab, embedding);
loadedCounter++;
embedding = null;
}
}
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment