Commit d8276ab5 authored by kristian.noullet's avatar kristian.noullet

Publishing Agnos Mini version

Publishing Agnos Mini version
parent 16261258
<?xml version="1.0" encoding="UTF-8"?>
<classpath>
<!-- Source folder "src"; its classes are compiled into target/classes -->
<classpathentry kind="src" output="target/classes" path="src">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- JRE container pinned to the JavaSE-1.8 execution environment -->
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"/>
<!-- Classpath container supplied by the m2e Maven integration -->
<classpathentry kind="con" path="org.eclipse.m2e.MAVEN2_CLASSPATH_CONTAINER">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<!-- Default output folder for compiled classes -->
<classpathentry kind="output" path="target/classes"/>
</classpath>
# Auto detect text files and perform LF normalization
* text=auto
# Compiled class file
*.class
# Log file
*.log
# BlueJ files
*.ctxt
# Mobile Tools for Java (J2ME)
.mtj.tmp/
# Package files and archives
*.jar
*.war
*.nar
*.ear
*.zip
*.tar.gz
*.rar
*.tmp
# Eclipse settings
.settings
# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml
hs_err_pid*
# Build output directories at the repository root
# (presumably /bin/ is the IDE output folder and /target/ the Maven one - confirm)
/bin/
/target/
<?xml version="1.0" encoding="UTF-8"?>
<projectDescription>
<!-- Eclipse project descriptor for the project named "Agnos_mini" -->
<name>Agnos_mini</name>
<comment></comment>
<projects>
</projects>
<!-- Builders, in the order listed: the JDT Java builder, then the m2e Maven builder -->
<buildSpec>
<buildCommand>
<name>org.eclipse.jdt.core.javabuilder</name>
<arguments>
</arguments>
</buildCommand>
<buildCommand>
<name>org.eclipse.m2e.core.maven2Builder</name>
<arguments>
</arguments>
</buildCommand>
</buildSpec>
<!-- Project natures: Maven (m2e) and Java (JDT) -->
<natures>
<nature>org.eclipse.m2e.core.maven2Nature</nature>
<nature>org.eclipse.jdt.core.javanature</nature>
</natures>
</projectDescription>
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
package launcher;
import org.json.JSONObject;
import api.JSONAPIAnnotator;
import structure.config.kg.EnumModelType;
/**
 * Manual smoke test: sends one hard-coded JSON request to a
 * {@link JSONAPIAnnotator} and prints the returned string.
 */
public class LauncherJSONTest {
    public static void main(String[] args) {
        final LauncherJSONTest launcher = new LauncherJSONTest();
        launcher.run();
    }

    /**
     * Builds the JSON request, initialises an annotator for
     * {@link EnumModelType#DBPEDIA_FULL} and writes the annotation result to
     * standard output.
     */
    private void run() {
        final String requestBody = "{\"topk\":false,\"input\":\"\",\"kg\":\"DBP\",\"fuzzy\":false,\"mentiondetection\":false}";
        final JSONObject request = new JSONObject(requestBody);
        final JSONAPIAnnotator annotator = new JSONAPIAnnotator(EnumModelType.DBPEDIA_FULL);
        annotator.init();
        final String annotated = annotator.annotateDocument(request);
        System.out.println(annotated);
    }
}
package launcher;
import java.io.File;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import linking.candidategeneration.CandidateGeneratorMap;
import linking.disambiguation.Disambiguator;
import linking.mentiondetection.InputProcessor;
import linking.mentiondetection.exact.HashMapCaseInsensitive;
import linking.mentiondetection.exact.MentionDetectorMap;
import structure.config.constants.EnumEmbeddingMode;
import structure.config.constants.FilePaths;
import structure.config.kg.EnumModelType;
import structure.datatypes.Mention;
import structure.interfaces.CandidateGenerator;
import structure.interfaces.MentionDetector;
import structure.utils.MentionPossibilityLoader;
import structure.utils.MentionUtils;
import structure.utils.Stopwatch;
/**
 * Command-line launcher that runs mention detection, candidate generation and
 * disambiguation, in that order, on a fixed sample sentence and prints the
 * resulting mentions.
 */
public class LauncherLinking {
    public static void main(String[] args) {
        try {
            final LauncherLinking launcher = new LauncherLinking();
            launcher.run();
        } catch (InterruptedException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Loads the surface form dictionary, wires up the three pipeline stages and
     * processes the sample input, reporting progress and the total duration on
     * standard output.
     *
     * @throws InterruptedException declared by the pipeline
     * @throws IOException          if the surface form file cannot be loaded
     */
    private void run() throws InterruptedException, IOException {
        // Knowledge graph the whole pipeline is configured for
        final EnumModelType KG = EnumModelType.DBPEDIA_FULL;
        final String input = "hello world";
        System.out.println("Computing for :" + KG.name());

        // The same dictionary backs mention detection and candidate generation
        final HashMap<String, Collection<String>> surfaceFormLinks = getMentions(KG);
        final MentionDetector detector = new MentionDetectorMap(surfaceFormLinks, new InputProcessor(null));
        final CandidateGenerator generator = new CandidateGeneratorMap(surfaceFormLinks);
        final Disambiguator disambiguator = new Disambiguator(KG, EnumEmbeddingMode.LOCAL);
        System.out.println("Finished loading structures - starting process");

        // Only the actual processing is timed, not the loading above
        final String watchName = getClass().getName();
        Stopwatch.start(watchName);
        final Collection<Mention> mentions = detector.detect(input);
        System.out.println("Finished MD - starting CG");
        generator.generate(mentions);
        System.out.println("Finished CG - starting Disambiguation");
        disambiguator.disambiguate(mentions);
        System.out.println("Finished Disambiguation - starting displaying...");
        final String durationLine = "Total Process Duration:" + Stopwatch.endDiffStart(watchName) + " ms.";
        System.out.println(durationLine);
        MentionUtils.displayMentions(mentions);
    }

    /**
     * Reads the surface form to entity mapping for the given knowledge graph and
     * copies it into a {@link HashMapCaseInsensitive}.
     *
     * @param KG knowledge graph whose surface form file is loaded
     * @return the loaded mapping, held in a case-insensitive map implementation
     * @throws IOException if loading the file fails
     */
    private HashMap<String, Collection<String>> getMentions(final EnumModelType KG) throws IOException {
        final MentionPossibilityLoader loader = new MentionPossibilityLoader(KG);
        final File surfaceFormFile = new File(FilePaths.FILE_ENTITY_SURFACEFORM_LINKING.getPath(KG));
        final Map<String, Collection<String>> loaded = loader.exec(surfaceFormFile);
        final HashMap<String, Collection<String>> caseInsensitive = new HashMapCaseInsensitive<Collection<String>>();
        // Entries are inserted one by one through put(..) so that the target map
        // implementation handles every single key itself
        loaded.forEach(caseInsensitive::put);
        return caseInsensitive;
    }
}
package launcher;
import java.io.IOException;
import java.util.Collection;
import java.util.HashMap;
import linking.candidategeneration.CandidateGeneratorMap;
import linking.disambiguation.Disambiguator;
import linking.mentiondetection.InputProcessor;
import linking.mentiondetection.exact.MentionDetectorMap;
import structure.config.kg.EnumModelType;
import structure.datatypes.Mention;
import structure.interfaces.CandidateGenerator;
import structure.interfaces.MentionDetector;
import structure.utils.MentionUtils;
/**
 * Minimal template showing how the linking pipeline is wired together. It runs
 * with an empty surface form dictionary and an empty input string.
 */
public class LauncherLinkingSample {
    public static void main(String[] args) {
        try {
            final LauncherLinkingSample sample = new LauncherLinkingSample();
            sample.run();
        } catch (InterruptedException | IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Creates the three pipeline stages and applies them in order: mention
     * detection, candidate generation, disambiguation. The resulting mentions are
     * then displayed.
     *
     * @throws InterruptedException declared by the pipeline
     * @throws IOException          declared by the pipeline
     */
    private void run() throws InterruptedException, IOException {
        final EnumModelType KG = EnumModelType.DBPEDIA_FULL;
        final String input = "";
        // Empty dictionary shared by mention detection and candidate generation
        final HashMap<String, Collection<String>> surfaceFormLinks = new HashMap<>();

        final MentionDetector detector = new MentionDetectorMap(surfaceFormLinks, new InputProcessor(null));
        final CandidateGenerator generator = new CandidateGeneratorMap(surfaceFormLinks);
        final Disambiguator disambiguator = new Disambiguator(KG);

        final Collection<Mention> mentions = detector.detect(input);
        generator.generate(mentions);
        disambiguator.disambiguate(mentions);
        MentionUtils.displayMentions(mentions);
    }
}
This diff is collapsed.
package linking.candidategeneration;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
import com.beust.jcommander.internal.Lists;
import structure.datatypes.Mention;
import structure.datatypes.PossibleAssignment;
import structure.interfaces.CandidateGenerator;
import structure.utils.Loggable;
/**
 * A simple dictionary/lookup-table-like implementation of candidate
 * generation: candidates for a mention are whatever the backing map stores
 * under the mention's surface form.
 *
 * @author Kristian Noullet
 *
 */
public class CandidateGeneratorMap implements CandidateGenerator, Loggable {
    /** Surface form mapped to the entities it may refer to. */
    private final Map<String, Collection<String>> linking;
    /**
     * Whether an unknown surface form aborts with an exception. It is a constant
     * {@code true}, so the {@code null}-returning branch in
     * {@link #generate(Mention)} is currently never taken.
     */
    private final boolean throwException = true;

    /**
     * @param linking lookup table from surface form to candidate entities; it is
     *                stored by reference, not copied
     */
    public CandidateGeneratorMap(Map<String, Collection<String>> linking) {
        this.linking = linking;
    }

    /**
     * Generates all possible assignments for given mention
     *
     * @param mention Mention for which to generate candidates!
     * @return one possible assignment per entity stored for the mention's surface
     *         form, in the iteration order of the stored collection
     * @throws RuntimeException if the lookup table has no entry for the mention
     */
    @Override
    public List<PossibleAssignment> generate(Mention mention) {
        final Collection<String> possibleEntities = this.linking.get(mention.getMention());
        if (possibleEntities == null) {
            getLogger().error("Could not find any such mention(" + mention.getMention() + ") o.o");
            if (throwException) {
                throw new RuntimeException("Could not find a mention(" + mention.getMention()
                        + ") although it apparently was detected...");
            } else {
                return null;
            }
        }
        // Plain JDK list, sized up front since the number of candidates is known
        final List<PossibleAssignment> ret = new ArrayList<>(possibleEntities.size());
        for (String entity : possibleEntities) {
            ret.add(PossibleAssignment.createNew(entity, mention.getMention()));
        }
        return ret;
    }

    /**
     * Generates the candidates for every given mention and stores them on the
     * mention itself.
     *
     * @param mentions mentions to update in place
     */
    @Override
    public void generate(Collection<Mention> mentions) {
        for (Mention m : mentions) {
            // Update possible assignments
            m.updatePossibleAssignments(generate(m));
        }
    }
}
This diff is collapsed.
package linking.disambiguation;
import java.util.function.BiFunction;
import structure.interfaces.Scorer;
import structure.utils.Loggable;
/**
 * Decides how the score produced by a single scorer is folded into the running
 * disambiguation score: the scorer's (optionally modulated) score is multiplied
 * by the scorer's weight and added to the score accumulated so far.
 *
 * @author Kristian Noullet
 *
 * @param <T> what type the scorers are working with
 */
public class ScoreCombiner<T> implements Loggable {
    /**
     * Adds the weighted contribution of one scorer to the current score.
     * <p>
     * A scorer may supply a modulation function that reshapes its raw score
     * before weighting. This is generally not needed, but PageRank values can be
     * extremely high by comparison and therefore need some smoothing (e.g.
     * through sqrt()).
     *
     * @param currScore   score accumulated so far, may be {@code null} (treated
     *                    as 0)
     * @param scorer      scorer whose contribution is added
     * @param scorerParam item handed to the scorer and to its modulation function
     * @return the updated score as a double value
     */
    public Number combine(final Number currScore, final Scorer<T> scorer, final T scorerParam) {
        final Number rawScore = scorer.computeScore(scorerParam);
        final Number scorerWeight = scorer.getWeight();
        final BiFunction<Number, T, Number> modulation = scorer.getScoreModulationFunction();

        final Number effectiveScore;
        if (modulation != null) {
            effectiveScore = modulation.apply(rawScore, scorerParam).doubleValue();
        } else {
            // No modulation configured: the raw score is used as it is
            effectiveScore = rawScore;
        }
        final double contribution = scorerWeight.doubleValue() * effectiveScore.doubleValue();
        return add(currScore, contribution);
    }

    /**
     * Transforms both numbers to double and adds them together.<br>
     * <b>If currScore is NULL, it is treated as 0.</b>
     *
     * @param currScore score accumulated so far, may be {@code null}
     * @param score     value to add
     * @return the sum as a double value
     */
    private Number add(Number currScore, Number score) {
        if (currScore == null) {
            return score.doubleValue();
        }
        return currScore.doubleValue() + score.doubleValue();
    }
}
package linking.disambiguation.scorers;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.function.BiFunction;
import org.apache.commons.lang3.tuple.MutablePair;
import org.apache.commons.lang3.tuple.Pair;
import com.github.jsonldjava.shaded.com.google.common.collect.Lists;
import linking.disambiguation.scorers.embedhelp.EntitySimilarityService;
import linking.disambiguation.scorers.pagerank.PageRankLoader;
import structure.config.constants.Comparators;
import structure.datatypes.Mention;
import structure.utils.MentionUtils;
/**
 * This cluster item picker continuously calls the {@link HillClimbingPicker}
 * picking logic, removing the weakest surface form after each iteration, for
 * as long as more than one surface form is left. The last iteration therefore
 * works on two.
 *
 * @author Kris
 *
 */
public class ContinuousHillClimbingPicker extends HillClimbingPicker {
    public ContinuousHillClimbingPicker(final BiFunction<Double, Double, Double> operation,
            final EntitySimilarityService similarityService, final PageRankLoader pagerankLoader) {
        super(operation, similarityService, pagerankLoader);
    }

    public ContinuousHillClimbingPicker(final EntitySimilarityService similarityService,
            final PageRankLoader pagerankLoader) {
        super(similarityService, pagerankLoader);
    }

    /**
     * Runs the iterative picking and returns the best entity per surface form.
     * <p>
     * ContinuousChoices = overall choices <br>
     * IterationChoices = choices for that specific iteration
     *
     * @return for every surface form that was picked at least once, the entity
     *         with the highest accumulated score
     */
    @Override
    public List<String> combine() {
        super.prune = false;
        int iterationCounter = 0;
        final List<Mention> copyContext = Lists.newArrayList(this.context);
        // Sorts them for the sake of initialisation picking based on word order
        Collections.sort(copyContext, Comparators.mentionOffsetComparator);
        // Computing clusters outside, so we don't have to redo it every time
        final Map<String, List<String>> clusters = computeClusters(copyContext);
        // Remove entities that do not have an associated embedding
        // & cluster if they are left w/o entity as a result of it
        removeInvalidEmbeddings(clusters);

        final Map<String, List<MutablePair<String, Double>>> continuousChoices = new HashMap<>();
        while (copyContext.size() > 1 && clusters.size() > 1) {
            // Do the picking logic
            final Map<String, Pair<String, Double>> iterationChoices = super.pickItems(clusters);
            // If no item has been picked, there is no need to continue... -> jump out
            if (iterationChoices == null || iterationChoices.isEmpty()) {
                break;
            }
            try {
                // Processes the choices and removes the worst 'cluster of candidates'
                processIterationResults(continuousChoices, iterationChoices, clusters, copyContext);
            } catch (IllegalArgumentException | NullPointerException exc) {
                dumpState(clusters, iterationChoices, copyContext);
                throw exc;
            }
            System.out.println("Iteration(#" + iterationCounter++ + ") Choices:");
            System.out.println(displayMap(iterationChoices));
        }

        // Now just get the best one for each surface form
        final List<String> retList = Lists.newArrayList();
        for (Entry<String, List<MutablePair<String, Double>>> entrySurfaceForm : continuousChoices.entrySet()) {
            // Start below every real score: Double.MIN_VALUE is the smallest POSITIVE
            // double and would silently drop entities scored zero or negative
            double maxValue = Double.NEGATIVE_INFINITY;
            String maxKey = null;
            for (MutablePair<String, Double> pair : entrySurfaceForm.getValue()) {
                if (pair.getValue() > maxValue) {
                    maxValue = pair.getValue();
                    maxKey = pair.getKey();
                }
            }
            if (maxKey != null) {
                retList.add(maxKey);
            }
        }
        getLogger().info("FINAL CHOICES[" + retList.size() + "]: " + retList);
        return retList;
    }

    /**
     * Writes the picker's working state to standard error; used right before an
     * exception from an iteration is rethrown.
     */
    private void dumpState(Map<String, List<String>> clusters, Map<String, Pair<String, Double>> iterationChoices,
            List<Mention> copyContext) {
        final String separator = "###########################################";
        System.err.println(separator);
        System.err.println("Clusters:" + displayMap(clusters));
        System.err.println(separator);
        System.err.println("Iteration Choices:" + displayMap(iterationChoices));
        System.err.println(separator);
        System.err.println("Copy context:" + copyContext);
        System.err.println(separator);
        System.err.println("Context: " + context);
        System.err.println(separator);
    }

    /**
     * Merges the choices of one iteration into the overall choices and then
     * removes the surface form with the lowest iteration score from both the
     * clusters and the working context.
     *
     * @param continuousChoices overall choices per surface form, updated in place
     * @param iterationChoices  entity-score pair picked per surface form in this
     *                          iteration
     * @param clusters          remaining clusters, the weakest one is removed
     * @param copyContext       remaining mentions, the weakest one is removed
     */
    private void processIterationResults(Map<String, List<MutablePair<String, Double>>> continuousChoices,
            Map<String, Pair<String, Double>> iterationChoices, Map<String, List<String>> clusters,
            List<Mention> copyContext) {
        // Number of clusters no longer present; acts as the iteration weight and
        // does not change until the removal at the end of this method
        final int iterationNumber = this.context.size() - clusters.size();

        // Go through our choices and fold them into the overall choices
        for (Map.Entry<String, Pair<String, Double>> iterationChoice : iterationChoices.entrySet()) {
            final List<MutablePair<String, Double>> continuousPairs = continuousChoices
                    .computeIfAbsent(iterationChoice.getKey(), unused -> Lists.newArrayList());
            final Pair<String, Double> iterationChoicePair = iterationChoice.getValue();
            boolean found = false;
            for (MutablePair<String, Double> continuousPair : continuousPairs) {
                if (continuousPair.getKey().equals(iterationChoicePair.getKey())) {
                    // Same entity = 'Collision' - so modify/update score accordingly
                    found = true;
                    continuousPair.setValue(computeNewValue(iterationNumber, continuousPair.getValue(),
                            iterationChoicePair.getValue()));
                }
            }
            if (!found) {
                // TODO: Check if logic really holds as rn I'm not sure whether there really is
                // exactly one pair here if it doesn't exist yet
                //
                // Not a collision, so just add it
                continuousPairs.add(new MutablePair<String, Double>(iterationChoicePair.getLeft(),
                        initVal(iterationChoicePair.getRight())));
            }
        }

        // Find the surface form with the worst score. Starting at +Infinity (rather
        // than Double.MAX_VALUE) lets an infinite score be selected as well; '<='
        // makes the last of several equally bad surface forms win.
        double minValue = Double.POSITIVE_INFINITY;
        String minKey = null;
        for (Map.Entry<String, Pair<String, Double>> e : iterationChoices.entrySet()) {
            final Double currentValue = e.getValue().getRight();
            if (currentValue <= minValue) {
                minKey = e.getKey();
                minValue = currentValue;
            }
        }
        // Remove surface form with worst result (as it likely is noise)
        clusters.remove(minKey);
        MentionUtils.removeStringMention(minKey, copyContext);
    }

    /**
     * Renders a map for debugging output with the default item limit of 10.
     */
    private <T> String displayMap(Map<String, T> map) {
        final int MAX_ITEMS = 10;
        return displayMap(map, MAX_ITEMS);
    }

    /**
     * Renders one {@code Key[..] value} line per map entry. Values that are
     * {@link Iterable} are listed element by element, separated by tab-semicolon-tab
     * and cut off once more than {@code MAX_ITEMS} elements have been written.
     * {@code null} values and elements are rendered as "null".
     *
     * @param map       map to render
     * @param MAX_ITEMS element count after which an iterable value is cut off
     * @return the rendered text, one line per entry
     */
    private <T> String displayMap(Map<String, T> map, final int MAX_ITEMS) {
        final StringBuilder retSB = new StringBuilder();
        final String NEWLINE = System.getProperty("line.separator");
        for (Map.Entry<String, T> e : map.entrySet()) {
            final T val = e.getValue();
            retSB.append("Key[" + e.getKey() + "] ");
            if (val instanceof Iterable) {
                final Iterator<?> valIt = ((Iterable<?>) val).iterator();
                int iterCounter = 0;
                while (valIt.hasNext()) {
                    if (iterCounter > 0) {
                        retSB.append("\t;\t");
                    }
                    retSB.append(String.valueOf(valIt.next()));
                    iterCounter++;
                    // The limit is only checked from the second element on, so the
                    // first two elements are always written
                    if (iterCounter > 1 && iterCounter > MAX_ITEMS) {
                        break;
                    }
                }
            } else {
                retSB.append(String.valueOf(val));
            }
            retSB.append(NEWLINE);
        }
        return retSB.toString();
    }

    /**
     * Initial accumulated score for an entity that is picked for the first time.
     * The score of the pick itself is deliberately ignored; every entity starts
     * at 1.
     *
     * @param right score of the first pick (unused)
     * @return always 1
     */
    private Double initVal(Double right) {
        return 1D;
    }

    /**
     * Computes the new value based on the iteration that we are part of, as well as
     * the previously existing value and the new value
     *
     * @param iterationNumber weight applied to the current value
     * @param previousValue   score accumulated so far
     * @param currentValue    score of the current iteration
     * @return {@code previousValue + iterationNumber * currentValue}
     */
    private Double computeNewValue(int iterationNumber, Double previousValue, Double currentValue) {
        return previousValue + iterationNumber * currentValue;
    }
}
package linking.disambiguation.scorers;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.BiFunction;