Commit 1892d382 authored by kristian.noullet's avatar kristian.noullet

Improved ease of installation

Improved ease of installation
parent c728c89d
......@@ -68,8 +68,8 @@ Post-configuration, you may run Agnos by executing <i>launcher.LauncherLinking</
<h1>Mention Detection</h1>
<br>Out-of-the-box Agnos provides users with 2 main mention detection mechanisms:
<ul>
<li><i>linking.mentiondetection.exact.MentionDetectorMap</i> (Exact matching)</li>
<li><i>linking.mentiondetection.fuzzy.MentionDetectorLSH</i> (Fuzzy matching)</li>
<li><i><a href="https://git.scc.kit.edu/wf7467/agnos_mini/-/tree/master/src/linking/mentiondetection/exact/MentionDetectorMap.java">linking.mentiondetection.exact.MentionDetectorMap</a></i> (Exact matching)</li>
<li><i><a href="https://git.scc.kit.edu/wf7467/agnos_mini/-/tree/master/src/linking/mentiondetection/fuzzy/MentionDetectorLSH.java">linking.mentiondetection.fuzzy.MentionDetectorLSH</a></i> (Fuzzy matching)</li>
</ul>
<br>The former performs mention detection by checking whether a possible input is contained within a passed map instance.
<br>The latter utilizes locality-sensitive hashing techniques (MinHash), allowing detection with a user-defined degree of fuzziness.
......
This diff is collapsed.
This diff is collapsed.
package install;
import structure.config.kg.EnumModelType;
public class InstallFiletree {
public static void main(String[] args) {
System.out.print("Building file tree...");
// Makes a simple call to the enum values to initiate the classloader to do its
// dependency-magic
EnumModelType.DEFAULT.values();
System.out.print("Done!");
}
}
package install;
import org.apache.log4j.Logger;
import install.surfaceform.query.HSFQueryExecutor;
import install.surfaceform.query.NP_HSFQueryExecutor;
import install.surfaceform.query.NP_URLHSFQueryExecutor;
import install.surfaceform.query.SFQueryExecutor;
import structure.Pipeline;
import structure.config.kg.EnumModelType;
/**
 * Executes all queries from defined folders (for SF, HSF, NP HSF, NP URL HSF)
 * and saves the output to the respective output folders for the specific type.
 *
 * @author Kristian Noullet
 *
 */
public class LauncherExecuteQueries {
	public static void main(String[] args) {
		final Logger logger = Logger.getLogger(LauncherExecuteQueries.class);
		final Pipeline pipeline = new Pipeline();
		// Knowledge graph for which all query executors are run.
		final EnumModelType KG = EnumModelType.DBPEDIA_FULL;
		final String execMsg = "Executing queries for KG(" + KG.name() + ") - (SF, HSF, NP_HSF, NP_URLHSF)";
		logger.info(execMsg);
		System.out.println(execMsg);
		// Queue all four surface-form query executors for this KG.
		pipeline.queue(new SFQueryExecutor(), KG);
		pipeline.queue(new HSFQueryExecutor(), KG);
		pipeline.queue(new NP_HSFQueryExecutor(), KG);
		pipeline.queue(new NP_URLHSFQueryExecutor(), KG);
		try {
			pipeline.exec();
		} catch (Exception e) {
			// Route the failure through the logger rather than swallowing it
			// with a bare printStackTrace().
			logger.error("Query execution failed for KG(" + KG.name() + ")", e);
		}
	}
}
package install;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.List;
import org.apache.jena.query.Dataset;
import org.apache.jena.query.ReadWrite;
import org.apache.jena.rdf.model.Model;
import org.apache.jena.tdb.TDBFactory;
import org.apache.jena.tdb.TDBLoader;
import com.google.common.collect.Lists;
import structure.config.constants.FilePaths;
import structure.config.kg.EnumModelType;
import structure.utils.Loggable;
/**
 * Load a specified knowledge base into a Jena dataset that we can query
 * henceforth
 *
 * @author Kris
 *
 */
public class LauncherSetupTDB implements Loggable {
	// Input paths whose TDB load was aborted due to an error; reported at the end.
	private static List<String> abortedList = Lists.newArrayList();

	public static void main(String[] args) {
		// new LauncherSetupTDB().exec();
		// Which knowledge graph to load into its TDB dataset.
		// Alternatives: MINI_MAG, DBLP, DBPEDIA, CRUNCHBASE2, MAG, DBPEDIA_FULL, WIKIDATA
		final EnumModelType KG = EnumModelType.CRUNCHBASE;
		System.out.println("Setting up TDB for: " + KG.name());
		// NOTE(review): KGpath must be set to the dump file or directory before
		// running; an empty string resolves to the current working directory path.
		final String KGpath = "";
		// Previously used inputs:
		// "/vol2/cb/crunchbase-201510/dumps/crunchbase-dump-201510.nt";//CB2015
		// "/vol2/cb/crunchbase-201806/dumps/crunchbase-dump-2018-06_sanitized.nt";// CB2018
		// "./cb2018-06/crunchbase-dump-2018-06.nt";//NORMALIZED_CB2
		// "/home/faerberm/inRDF-URI-as-obj/";// MAG
		// "/vol2/kris/PaperReferences_o.nt";// MAG PaperReferences_o.nt
		// "/vol2/dblp/dumps/dblp_2018-11-02_unique.nt";//DBLP
		// "./dblp_kg/dblp_2018-11-02_unique.nt";//NORMALIZED_DBLP
		// "/vol1/mag/data/2018-07-19/MAGFieldsOfStudyKG/MAGFieldsOfStudyKG.nt";//Mini-MAG
		// "./crunchbase-dump-2018-06_normalized.nt";// normalized CB2
		// "./dblp_2018-11-02_unique_normalized.nt";// normalized DBLP
		// "/vol1/data_faerberm/kris/data_dbpedia_extracted";// DBpedia
		// "/home/noulletk/prog/bmw/dbpedia/resources/data/datasets/extracted/";// DBpedia
		// "/home/noulletk/prog/bmw/input_dbpedia/";// DBpedia
		// "/vol2/wikidata/dumps/20190213/wikidata-20190213-truthy-BETA_all_URI-obj.nt";
		// Handle appropriately both for input file (just load it)
		// and input directory (get all files within it, aka. ignore subdirectories)
		final File inFile = new File(KGpath);
		final List<String> inFiles = Lists.newArrayList();
		if (inFile.isDirectory()) {
			// Just takes files from the first level, does NOT go deeper if a
			// directory is contained within the specified directory
			for (File f : inFile.listFiles()) {
				if (f.isFile()) {
					inFiles.add(f.getAbsolutePath());
				}
			}
		} else {
			inFiles.add(inFile.getAbsolutePath());
		}
		// Execute the loading part...
		for (String kgInPath : inFiles) {
			System.out.println("Source(" + (inFiles.indexOf(kgInPath) + 1) + "/" + inFiles.size() + "): " + kgInPath);
			new LauncherSetupTDB().exec(KG, kgInPath);
			System.out.println("Aborted (" + abortedList.size() + "): " + abortedList);
		}
		System.out.println("Aborted files(" + abortedList.size() + "): " + abortedList);
	}

	/**
	 * Loads all KGs from FilePaths.FILE_EXTENDED_GRAPH into their respective
	 * datasets, skipping empty files.
	 */
	private void exec() {
		for (EnumModelType KG : EnumModelType.values()) {
			final String KGpath = FilePaths.FILE_EXTENDED_GRAPH.getPath(KG);
			// Read a line to make sure it is not an empty file we are trying to load
			try (BufferedReader br = new BufferedReader(new FileReader(KGpath))) {
				if (br.readLine() == null) {
					// Skip this file if it's empty
					getLogger().info("Skipping " + KG.name() + " due to empty file.");
					continue;
				} else {
					// Process file if it's not empty
					getLogger().info("Loading " + KG.name());
					exec(KG, KGpath);
				}
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
	}

	/**
	 * Loads a single KG into the appropriate dataset.
	 *
	 * @param KG which graph it corresponds to
	 * @param KGpath where to load it from
	 */
	private void exec(EnumModelType KG, final String KGpath) {
		final String datasetPath = FilePaths.DATASET.getPath(KG);
		final Dataset dataset = TDBFactory.createDataset(datasetPath);
		// Load within a single WRITE transaction; the previous READ
		// transaction (begin/getDefaultModel/end) was dead code. The model
		// must be obtained inside the transaction for TDB to track updates.
		dataset.begin(ReadWrite.WRITE);
		try {
			final Model model = dataset.getDefaultModel();
			TDBLoader.loadModel(model, KGpath, true);
			dataset.commit();
		} catch (Exception e) {
			// Record the failed input and roll back this load.
			System.out.println("Aborted: " + KGpath);
			abortedList.add(KGpath);
			dataset.abort();
		} finally {
			dataset.end();
		}
	}
}
package install;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import com.beust.jcommander.internal.Lists;
import install.pr.PageRankRDF;
import structure.config.constants.FilePaths;
import structure.config.kg.EnumModelType;
import structure.interfaces.Executable;
/**
 * Computes apriori PageRank scores for a given KG
 *
 * @author Kristian Noullet
 *
 */
public class PageRankComputer implements Executable {
	// Knowledge graph for which scores are computed.
	final EnumModelType KG;
	// Whether node URIs are treated case-sensitively during computation.
	final boolean caseSensitive;

	/**
	 * Case-sensitive computer for the given KG.
	 *
	 * @param KG knowledge graph to compute PageRank for
	 */
	public PageRankComputer(final EnumModelType KG) {
		this(KG, true);
	}

	/**
	 * @param KG            knowledge graph to compute PageRank for
	 * @param caseSensitive whether URI casing is significant
	 */
	public PageRankComputer(final EnumModelType KG, final boolean caseSensitive) {
		this.KG = KG;
		this.caseSensitive = caseSensitive;
	}

	@Override
	public void init() {
	}

	@Override
	public boolean reset() {
		return false;
	}

	/**
	 * Runs PageRank over the given input paths (files or first-level contents
	 * of directories) and writes the RDF-formatted scores to
	 * FilePaths.FILE_PAGERANK for this KG.
	 *
	 * @param inPaths input files and/or directories containing the KG dump
	 * @throws IOException if the output file cannot be written
	 */
	private void pagerank(final Collection<String> inPaths) throws IOException {
		final String out = FilePaths.FILE_PAGERANK.getPath(KG);
		final List<String> inFiles = Lists.newArrayList();
		for (String inPath : inPaths) {
			final File inFile = new File(inPath);
			if (inFile.isFile()) {
				inFiles.add(inFile.getAbsolutePath());
			} else if (inFile.isDirectory()) {
				// Only first-level files; nested directories are ignored.
				for (File f : inFile.listFiles()) {
					if (f.isFile()) {
						inFiles.add(f.getAbsolutePath());
					}
				}
			}
		}
		final PageRankRDF pageRankRDF = new PageRankRDF(inFiles, 0.85, 1.0, 50, false, this.caseSensitive);
		pageRankRDF.compute();
		try (PrintWriter wrt = new PrintWriter(new BufferedWriter(new FileWriter(new File(out))))) {
			pageRankRDF.printPageRankScoresRDF(wrt);
		}
	}

	/**
	 * Collects all String / String[] / Collection-of-String arguments into a
	 * single list of input paths and runs PageRank once over them.
	 *
	 * @param o input paths in any of the supported argument shapes
	 * @return always null
	 */
	@Override
	public <T> T exec(Object... o) {
		try {
			// BUGFIX: the previous code additionally special-cased
			// `o instanceof String[]` and ran pagerank() on it BEFORE the
			// generic loop, which then re-collected the same paths and ran
			// pagerank() a second time. The loop below covers all cases once.
			final List<String> files = Lists.newArrayList();
			for (Object obj : o) {
				if (obj instanceof String) {
					files.add((String) obj);
				} else if (obj instanceof String[]) {
					files.addAll(Arrays.asList((String[]) obj));
				} else if (obj instanceof Collection) {
					for (Object subObj : (Collection<?>) obj) {
						if (subObj instanceof String) {
							files.add((String) subObj);
						}
					}
				}
			}
			getLogger().info("PR w/ " + files);
			pagerank(files);
		} catch (IOException e) {
			e.printStackTrace();
		}
		return null;
	}

	@Override
	public boolean destroy() {
		return false;
	}

	@Override
	public String getExecMethod() {
		return "pagerank";
	}
}
package install.pr;
import java.io.PrintWriter;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Set;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.lang.PipedRDFIterator;
import com.beust.jcommander.internal.Lists;
import structure.datatypes.pr.PageRankScore;
import structure.utils.Loggable;
/**
 * PageRank computation over RDF dumps.
 *
 * @author WDAqua (https://github.com/WDAqua/PageRankRDF)
 *         Modified by: wf7467
 *
 */
public class PageRankRDF implements Loggable {
	public static Double DEFAULT_DAMPING = 0.85D;
	public static Double DEFAULT_START_VALUE = 0.1D;
	public static int DEFAULT_ITERATION = 40;
	public static boolean DEFAULT_LITERALS = false;
	public static boolean DEFAULT_CASE_SENSITIVE = true;
	private double dampingFactor = DEFAULT_DAMPING;
	private double startValue = DEFAULT_START_VALUE;
	private int numberOfIterations = DEFAULT_ITERATION;
	// RDF dump file paths to read triples from.
	private Collection<String> dumps;
	// Node -> PageRank score; keys are lowercased when !caseSensitive.
	private HashMap<String, Double> pageRankScores = new HashMap<>();
	// Whether literal objects participate in the link graph.
	private boolean literals = DEFAULT_LITERALS;
	private boolean caseSensitive = DEFAULT_CASE_SENSITIVE;

	public PageRankRDF(String dump) {
		this.dumps = Lists.newArrayList(dump);
	}

	public PageRankRDF(String dump, double dampingFactor, double startValue, int numberOfIterations, boolean literals,
			boolean caseSensitive) {
		this(Lists.newArrayList(dump), dampingFactor, startValue, numberOfIterations, literals, caseSensitive);
	}

	public PageRankRDF(String dump, double dampingFactor, double startValue, int numberOfIterations,
			boolean caseSensitive) {
		this(Lists.newArrayList(dump), dampingFactor, startValue, numberOfIterations, false, caseSensitive);
	}

	public PageRankRDF(String[] dumps, double dampingFactor, double startValue, int numberOfIterations,
			boolean literals, boolean caseSensitive) {
		this(Arrays.asList(dumps), dampingFactor, startValue, numberOfIterations, literals, caseSensitive);
	}

	public PageRankRDF(Collection<String> dumps, double dampingFactor, double startValue, int numberOfIterations,
			boolean literals, boolean caseSensitive) {
		this.dumps = dumps;
		this.dampingFactor = dampingFactor;
		this.startValue = startValue;
		this.numberOfIterations = numberOfIterations;
		this.literals = literals;
		this.caseSensitive = caseSensitive;
	}

	/**
	 * Reads all dumps, builds the incoming-link and outgoing-count maps, then
	 * iterates the PageRank formula numberOfIterations times, storing results
	 * in pageRankScores.
	 */
	public void compute() {
		getLogger().info("Computing pagerank for dumps:" + dumps);
		// Number of outgoing edges per subject (original casing).
		final HashMap<String, Integer> numberOutgoing = new HashMap<>();
		// Incoming subjects per node (original casing).
		final HashMap<String, ArrayList<String>> incomingPerPage = new HashMap<String, ArrayList<String>>();
		long time = System.currentTimeMillis();
		final long initTime = time;
		for (String dump : this.dumps) {
			System.err.println("Processing " + dump);
			PipedRDFIterator<Triple> iter = Parser.parse(dump);
			while (iter.hasNext()) {
				Triple t = iter.next();
				if (literals || t.getObject().isURI()) {
					ArrayList<String> incoming = incomingPerPage.get(t.getObject().toString());
					if (incoming == null) {
						incoming = new ArrayList<>();
						incomingPerPage.put(t.getObject().toString(), incoming);
					}
					// Ensure the subject is also present as a node, even if
					// nothing links to it.
					ArrayList<String> incoming2 = incomingPerPage.get(t.getSubject().toString());
					if (incoming2 == null) {
						incomingPerPage.put(t.getSubject().toString(), new ArrayList<>());
					}
					incoming.add(t.getSubject().toString());
					Integer numberOut = numberOutgoing.get(t.getSubject().toString());
					if (numberOut == null) {
						numberOutgoing.put(t.getSubject().toString(), Integer.valueOf(1));
					} else {
						numberOutgoing.put(t.getSubject().toString(), Integer.valueOf(numberOut.intValue() + 1));
					}
				}
			}
			iter.close();
			System.err.println("Reading input(" + dump + ") took " + (System.currentTimeMillis() - time) / 1000L + "s");
			time = System.currentTimeMillis();
		}
		System.err.println("Computing PageRank: " + numberOfIterations + " iterations, damping factor " + dampingFactor
				+ ", start value " + startValue + ", considering literals " + literals);
		System.err.println("Iteration ...");
		for (int i = 0; i < numberOfIterations; i++) {
			System.err.print(i + " ");
			for (final String incomingKey : incomingPerPage.keySet()) {
				// Key under which the score is stored (lowercased when
				// case-insensitive).
				final String scoreKey = caseSensitive ? incomingKey : incomingKey.toLowerCase();
				// BUGFIX: incomingPerPage is keyed with ORIGINAL casing, so
				// looking links up with the lowercased key returned null and
				// caused an NPE in case-insensitive mode.
				final List<String> incomingLinks = incomingPerPage.get(incomingKey);
				double pageRank = 1.0d - dampingFactor;
				for (final String inLink : incomingLinks) {
					// BUGFIX: look scores up under the same (possibly
					// lowercased) key they are stored under.
					final String inLinkKey = caseSensitive ? inLink : inLink.toLowerCase();
					Double pageRankIn = pageRankScores.get(inLinkKey);
					if (pageRankIn == null) {
						pageRankIn = Double.valueOf(startValue);
					}
					// inLink always appeared as a subject, so the count exists.
					final double numberOut = (double) numberOutgoing.get(inLink);
					pageRank += dampingFactor * (pageRankIn / numberOut);
				}
				pageRankScores.put(scoreKey, pageRank);
			}
		}
		System.err.println();
		System.err.println("Computing PageRank took " + (System.currentTimeMillis() - time) / 1000L + "s");
		System.err.println("Total execution time: " + (System.currentTimeMillis() - initTime) / 1000L + "s");
	}

	/**
	 * @return computed scores wrapped as PageRankScore objects
	 */
	public List<PageRankScore> getPageRankScores() {
		List<PageRankScore> scores = new ArrayList<PageRankScore>();
		Set<String> keysetNew = pageRankScores.keySet();
		for (String string : keysetNew) {
			PageRankScore s = new PageRankScore();
			s.assignment = string;
			s.score = pageRankScores.get(string);
			scores.add(s);
		}
		return scores;
	}

	/**
	 * Writes scores as tab-separated "node\tscore" lines.
	 *
	 * @param writer destination for the TSV output
	 */
	public void printPageRankScoresTSV(PrintWriter writer) {
		Set<String> keysetNew = pageRankScores.keySet();
		for (String string : keysetNew) {
			writer.println(string + "\t" + String.format("%.10f", pageRankScores.get(string)));
		}
	}

	/**
	 * Writes scores as N-Triples using the vrank#pagerank predicate.
	 *
	 * @param writer destination for the RDF output
	 */
	public void printPageRankScoresRDF(PrintWriter writer) {
		Set<String> keysetNew = pageRankScores.keySet();
		for (String string : keysetNew) {
			writer.println("<" + string + "> <http://purl.org/voc/vrank#pagerank> \""
					+ String.format("%.10f", pageRankScores.get(string))
					+ "\"^^<http://www.w3.org/2001/XMLSchema#float> .");
		}
	}
}
package install.pr;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import org.apache.jena.graph.Triple;
import org.apache.jena.riot.RDFDataMgr;
import org.apache.jena.riot.lang.PipedRDFIterator;
import org.apache.jena.riot.lang.PipedRDFStream;
import org.apache.jena.riot.lang.PipedTriplesStream;
/**
 * Asynchronous streaming parser for RDF dumps.
 *
 * @author WDAqua (https://github.com/WDAqua/PageRankRDF)
 *
 */
public class Parser {
	/**
	 * Starts parsing the given dump on a background thread and returns an
	 * iterator over its triples that fills as parsing progresses.
	 *
	 * @param dump path/URI of the RDF dump to parse
	 * @return iterator over the parsed triples
	 */
	public static PipedRDFIterator<Triple> parse(String dump) {
		final PipedRDFIterator<Triple> iter = new PipedRDFIterator<Triple>();
		final String d = dump;
		System.out.println("DUMP=" + d);
		final PipedRDFStream<Triple> inputStream = new PipedTriplesStream(iter);
		// PipedRDFStream and PipedRDFIterator need to be on different threads
		ExecutorService executor = Executors.newSingleThreadExecutor();
		// Create a runnable for our parser thread
		Runnable parser = new Runnable() {
			@Override
			public void run() {
				// Call the parsing process.
				RDFDataMgr.parse(inputStream, d);
			}
		};
		// Start the parser on another thread
		executor.submit(parser);
		// BUGFIX: shut the executor down so its (non-daemon) worker thread
		// terminates after parsing finishes instead of leaking and keeping
		// the JVM alive. shutdown() still lets the submitted task complete.
		executor.shutdown();
		return iter;
	}
}
package install.surfaceform.query;
import install.surfaceform.query.general.HSFQuery;
import structure.config.kg.EnumModelType;
import structure.interfaces.Executable;
/**
 * Executable wrapper running the HSF queries for a knowledge graph.
 */
public class HSFQueryExecutor implements Executable {
	@Override
	public void init() {
	}

	@Override
	public boolean reset() {
		return false;
	}

	/**
	 * Executes the HSF queries for the first EnumModelType found among the
	 * arguments, falling back to EnumModelType.DEFAULT when none is passed.
	 *
	 * @param o arguments; an EnumModelType selects the knowledge graph
	 * @return always null
	 */
	@Override
	public <T> T exec(Object... o) throws Exception {
		EnumModelType kg = EnumModelType.DEFAULT;
		for (final Object arg : o) {
			if (arg instanceof EnumModelType) {
				kg = (EnumModelType) arg;
				break;
			}
		}
		new HSFQuery(kg).execQueries();
		return null;
	}

	@Override
	public boolean destroy() {
		return false;
	}
}
package install.surfaceform.query;
import install.surfaceform.query.general.NP_HSFQuery;
import structure.config.kg.EnumModelType;
import structure.interfaces.Executable;
/**
 * Executable wrapper running the NP HSF queries for a knowledge graph.
 */
public class NP_HSFQueryExecutor implements Executable {
	@Override
	public void init() {
	}

	@Override
	public boolean reset() {
		return false;
	}

	/**
	 * Executes the NP HSF queries for the first EnumModelType found among the
	 * arguments, falling back to EnumModelType.DEFAULT when none is passed.
	 *
	 * @param o arguments; an EnumModelType selects the knowledge graph
	 * @return always null
	 */
	@Override
	public <T> T exec(Object... o) throws Exception {
		EnumModelType kg = EnumModelType.DEFAULT;
		for (final Object arg : o) {
			if (arg instanceof EnumModelType) {
				kg = (EnumModelType) arg;
				break;
			}
		}
		new NP_HSFQuery(kg).execQueries();
		return null;
	}

	@Override
	public boolean destroy() {
		return false;
	}
}
package install.surfaceform.query;
import install.surfaceform.query.general.NP_URL_HSFQuery;
import structure.config.kg.EnumModelType;
import structure.interfaces.Executable;
/**
 * Executable wrapper running the NP URL HSF queries for a knowledge graph.
 */
public class NP_URLHSFQueryExecutor implements Executable {
	@Override
	public void init() {
	}

	// Added for consistency with the sibling executors (HSFQueryExecutor,
	// NP_HSFQueryExecutor), which both override reset() to return false.
	@Override
	public boolean reset() {
		return false;
	}

	/**
	 * Executes the NP URL HSF queries for the first EnumModelType found among
	 * the arguments, falling back to EnumModelType.DEFAULT when none is passed.
	 *
	 * @param o arguments; an EnumModelType selects the knowledge graph
	 * @return always null
	 */
	@Override
	public <T> T exec(Object... o) throws Exception {
		EnumModelType kg = EnumModelType.DEFAULT;
		for (final Object arg : o) {
			if (arg instanceof EnumModelType) {
				kg = (EnumModelType) arg;
				break;
			}
		}
		new NP_URL_HSFQuery(kg).execQueries();
		return null;
	}

	@Override
	public boolean destroy() {
		return false;
	}
}