Commit bba6aea9 authored by Nandeesh Patel Gowdru Prabushanker's avatar Nandeesh Patel Gowdru Prabushanker
Browse files

Adding MAG(AGDISTIS) module

parent 1e831d89
.DS_Store
.git/
indexdbpedia*
src/
documents/
target/
log/
bin/
indextest/
src/test/resources/
.classpath
.settings/
.project
.metadata
.DS_Store
AIDA-YAGO2-dataset.tsv
*.jar
*.xml
*.ttl
lib/aida.jar
*.txt
indexdbpedia*
.idea
*.iws
*.iml
*.ipr
# GitLab CI pipeline for AGDISTIS.
# Flow: unit tests -> WAR build -> Docker image build -> smoke test of the
# image -> release of the tested image under the `latest` tag.
# Every job is restricted to the master branch.
stages:
  - test
  - build
  - build-docker
  - test-docker
  - release

variables:
  # Per-ref image tags used while the image is still under test.
  # NOTE(review): CI_BUILD_REF_NAME looks like the legacy spelling of
  # CI_COMMIT_REF_NAME - confirm against the GitLab version in use.
  TEST_IMAGE: aksw/agdistis:$CI_BUILD_REF_NAME
  TEST_IMAGE_DICE: dicegroup/agdistis:$CI_BUILD_REF_NAME
  # Tags published by the release job.
  IMAGE: aksw/agdistis:latest
  IMAGE_DICE: dicegroup/agdistis:latest

# Runs the Maven test suite against the bundled test index.
test:
  image: maven:3.5.3-jdk-10
  stage: test
  only:
    - master
  script:
    # The tests read the index from ./index, so copy the test index there.
    - cp -r indextest index
    - mvn clean test

# Packages the WAR (tests already ran in the previous stage) and keeps it
# as an artifact for the Docker build.
build:
  image: maven
  stage: build
  only:
    - master
  script:
    - mvn clean package -Dmaven.test.skip=true
  artifacts:
    paths:
      - target/*.war

# Builds the Docker image and pushes it under both per-ref test tags.
build-docker:
  image: docker:latest
  stage: build-docker
  services:
    - docker:dind
  only:
    - master
  before_script:
    # Feed the password through stdin so it does not appear in the process
    # list or in the job's command echo.
    - echo "$CI_REGISTRY_PASS" | docker login -u "$CI_REGISTRY_USER" --password-stdin
  script:
    - docker build --pull -t $TEST_IMAGE .
    - docker push $TEST_IMAGE
    - docker tag $TEST_IMAGE $TEST_IMAGE_DICE
    - docker push $TEST_IMAGE_DICE

# Starts the freshly pushed image with the test index mounted and runs the
# smoke-test script inside the container.
test-docker:
  image: docker
  stage: test-docker
  only:
    - master
  services:
    - docker:dind
  script:
    - docker pull $TEST_IMAGE
    - docker run -d --name agdistis -v `pwd`/indextest:/usr/local/tomcat/index -v `pwd`/test-docker.sh:/usr/local/tomcat/test-docker.sh $TEST_IMAGE
    # Give the container time to start before testing.
    - sleep 30
    - docker exec agdistis sh /usr/local/tomcat/test-docker.sh

# Re-tags the tested image as `latest` and pushes it to both repositories.
release:
  image: docker:latest
  stage: release
  services:
    - docker:dind
  only:
    - master
  before_script:
    # Feed the password through stdin so it does not appear in the process
    # list or in the job's command echo.
    - echo "$CI_REGISTRY_PASS" | docker login -u "$CI_REGISTRY_USER" --password-stdin
  script:
    - docker pull $TEST_IMAGE
    - docker tag $TEST_IMAGE $IMAGE
    - docker push $IMAGE
    - docker tag $TEST_IMAGE $IMAGE_DICE
    - docker push $IMAGE_DICE
# Base image: the tomcat image, tag 7-jre8.
FROM tomcat:7-jre8
# Copy the WAR matching target/AGDISTIS*.war from the build context into the
# image as webapps/AGDISTIS.war (path is relative to the base image's working
# directory - presumably the Tomcat home; confirm against the base image).
COPY target/AGDISTIS*.war webapps/AGDISTIS.war
This diff is collapsed.
# AGDISTIS - Agnostic Named Entity Disambiguation
[![Build Status](https://gitlab.com/aksw/AGDISTIS/badges/master/build.svg)](https://gitlab.com/aksw/AGDISTIS/pipelines)
[![Project Stats](https://www.openhub.net/p/AGDISTIS/widgets/project_thin_badge.gif)](https://www.ohloh.net/p/AGDISTIS)
[![BCH compliance](https://bettercodehub.com/edge/badge/AKSW/AGDISTIS)](https://bettercodehub.com/)
>The new web services are available here:
>```
>en http://akswnc9.informatik.uni-leipzig.de:8113/AGDISTIS
>de http://akswnc9.informatik.uni-leipzig.de:8114/AGDISTIS
>es http://akswnc9.informatik.uni-leipzig.de:8115/AGDISTIS
>fr http://akswnc9.informatik.uni-leipzig.de:8116/AGDISTIS
>it http://akswnc9.informatik.uni-leipzig.de:8117/AGDISTIS
>ja http://akswnc9.informatik.uni-leipzig.de:8118/AGDISTIS
>nl http://akswnc9.informatik.uni-leipzig.de:8119/AGDISTIS
>pt http://akswnc9.informatik.uni-leipzig.de:8220/AGDISTIS
>zh http://139.18.2.164:8080/AGDISTIS_ZH
>wikidata http://akswnc9.informatik.uni-leipzig.de:8221/AGDISTIS
>```
This project aims at delivering a framework for disambiguating a priori annotated named entities.
More information about the project can be found <a href="http://aksw.org/projects/AGDISTIS">here</a> and in our <a href="https://github.com/AKSW/AGDISTIS/wiki">Wiki</a>.
Supplementary material can be found in the documents folder.
We hope you will enjoy using AGDISTIS!
### Support and Feedback
If you need help or have questions, do not hesitate to write an email to <a href="mailto:usbeck@uni-paderborn.de">Dr. Ricardo Usbeck</a> or use the issue tracker in the right sidebar.
### How to cite
```Tex
@InProceedings{Moussallem2017,
author = {Diego Moussallem and Ricardo Usbeck and Michael R{\"o}der and Axel-Cyrille {Ngonga Ngomo}},
title = {{MAG: A Multilingual, Knowledge-base Agnostic and Deterministic Entity Linking Approach}},
booktitle = {K-CAP 2017: Knowledge Capture Conference},
year = {2017},
pages = {8},
organization = {ACM},
url = {https://svn.aksw.org/papers/2017/KCAP_MAG/sigconf-main.pdf},
}
@incollection{AGDISTIS_ISWC,
author = {Usbeck, Ricardo and {Ngonga Ngomo}, Axel-Cyrille and Auer, S{\"o}ren and Gerber, Daniel and Both, Andreas},
booktitle = {13th International Semantic Web Conference},
title = {AGDISTIS - Graph-Based Disambiguation of Named Entities using Linked Data},
url = {http://svn.aksw.org/papers/2014/ISWC_AGDISTIS/public.pdf},
year = 2014
}
```
### Acknowledgements
The first version of this work was supported by the ESF and the Free State of Saxony.
AGDISTIS is now supported by the German Federal Ministry of Education and Research and EuroStars.
### Annotation Tool
The annotation tool we used can be downloaded from <a href="https://github.com/RicardoUsbeck/QRTool">here</a>.
### Disclaimer
The deployed webservice does not reflect the optimal parametrization of AGDISTIS as published.
### Bindings
* Python bindings: https://pypi.python.org/pypi/agdistispy/
### Running AGDISTIS
### How to run
```
mvn clean package tomcat:run
```
For more information, go to our <a href="https://github.com/AKSW/AGDISTIS/wiki/3-Running-the-webservice">Wiki</a>.
#!/bin/bash
# Downloads the AGDISTIS web-service logs of every deployed language from the
# remote host. For each language code a local directory of the same name is
# created and the remote <language>/logs directory is copied into it.

remote_host='RicardoUsbeck@akswnc9.informatik.uni-leipzig.de'
remote_base='/home/JonathanEberle/agdistis'

for lang in de en es fr it ja nl
do
	# -p: do not fail when the directory is left over from an earlier run.
	mkdir -p "$lang"
	scp -r "${remote_host}:${remote_base}/${lang}/logs" "$lang"
done
#!/bin/bash
# For every language, scans <language>/logs/*.log for lines containing
# "AGDISTIS" followed by a tab, appends the second whitespace-separated field
# of each such line to calldate_<language>.txt, and prints that file's line
# count.

# A real tab character: "\t" inside a grep basic regular expression is not a
# portable way to match a tab, so the pattern is built with bash's $'...'.
tab=$'\t'

for lang in de en es fr it ja nl
do
	out="calldate_${lang}.txt"
	for f in "$lang"/logs/*.log
	do
		# An unmatched glob stays literal; skip it instead of handing a
		# non-existent file name to grep.
		[ -e "$f" ] || continue
		grep "AGDISTIS${tab}" "$f" | awk '{print $2}'
	done >> "$out"   # append, as before; the redirect also creates the file
	wc -l "$out"
done
package org.aksw.agdistis.algorithm;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.Properties;
/**
 * Removes corporation affixes from the end of labels. The affixes are read,
 * one per line, from a classpath resource.
 */
public class CorporationAffixCleaner {

	/** Affixes loaded from the configured resource, one entry per line. */
	HashSet<String> corporationAffixes = new HashSet<String>();

	/**
	 * Reads <code>/config/agdistis.properties</code> from the classpath and
	 * loads the affix list. The resource name of the list comes from the
	 * environment variable <code>AGDISTIS_CORPORATION_AFFIXES</code> if it is
	 * set, otherwise from the property <code>corporationAffixes</code>.
	 *
	 * @throws IOException
	 *             if a resource is missing or cannot be read
	 */
	public CorporationAffixCleaner() throws IOException {
		Properties prop = new Properties();
		// try-with-resources: the properties stream was previously never closed
		try (InputStream input = CorporationAffixCleaner.class.getResourceAsStream("/config/agdistis.properties")) {
			if (input == null) {
				throw new IOException("Classpath resource not found: /config/agdistis.properties");
			}
			prop.load(input);
		}

		String envCorpAffixes = System.getenv("AGDISTIS_CORPORATION_AFFIXES");
		String file = envCorpAffixes != null ? envCorpAffixes : prop.getProperty("corporationAffixes");

		loadCorporationAffixes(file);
	}

	/**
	 * Adds every line of the given classpath resource to
	 * {@link #corporationAffixes}.
	 *
	 * @param file
	 *            classpath resource name of the affix list
	 * @throws IOException
	 *             if the resource is missing or cannot be read
	 */
	private void loadCorporationAffixes(String file) throws IOException {
		InputStream resource = CorporationAffixCleaner.class.getResourceAsStream(file);
		if (resource == null) {
			throw new IOException("Classpath resource not found: " + file);
		}
		// The reader is closed even when reading fails.
		try (BufferedReader br = new BufferedReader(new InputStreamReader(resource))) {
			// readLine() returning null is the end-of-stream signal; ready()
			// only reports whether a read would block.
			String line;
			while ((line = br.readLine()) != null) {
				corporationAffixes.add(line);
			}
		}
	}

	/**
	 * Cuts every known affix that the label ends with off the label.
	 *
	 * @param label
	 *            the label to clean
	 * @return the trimmed label without the matching trailing affixes
	 */
	String cleanLabelsfromCorporationIdentifier(String label) {
		for (String corporationAffix : corporationAffixes) {
			if (label.endsWith(corporationAffix)) {
				label = label.substring(0, label.lastIndexOf(corporationAffix));
			}
		}
		return label.trim();
	}
}
package org.aksw.agdistis.algorithm;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.HashSet;
import java.util.List;
import java.util.Properties;
import org.aksw.agdistis.util.Triple;
import org.aksw.agdistis.util.TripleIndex;
/**
 * Checks whether a candidate's rdf:type values appear in a white list that is
 * read, one entry per line, from a classpath resource.
 */
public class DomainWhiteLister {

	private TripleIndex index;

	/** White-listed type values, one entry per line of the resource. */
	HashSet<String> whiteList = new HashSet<String>();

	/**
	 * Reads <code>/config/agdistis.properties</code> from the classpath and
	 * loads the white list. The resource name of the list comes from the
	 * environment variable <code>AGDISTIS_WHITELIST</code> if it is set,
	 * otherwise from the property <code>whiteList</code>.
	 *
	 * @param index
	 *            the triple index queried by {@link #fitsIntoDomain(String)}
	 * @throws IOException
	 *             if a resource is missing or cannot be read
	 */
	public DomainWhiteLister(TripleIndex index) throws IOException {
		Properties prop = new Properties();
		// try-with-resources: the properties stream was previously never closed
		try (InputStream input = DomainWhiteLister.class.getResourceAsStream("/config/agdistis.properties")) {
			if (input == null) {
				throw new IOException("Classpath resource not found: /config/agdistis.properties");
			}
			prop.load(input);
		}

		String envWhiteList = System.getenv("AGDISTIS_WHITELIST");
		String file = envWhiteList != null ? envWhiteList : prop.getProperty("whiteList");

		loadWhiteDomains(file);

		this.index = index;
	}

	/**
	 * Adds every line of the given classpath resource to {@link #whiteList}.
	 *
	 * @param file
	 *            classpath resource name of the white list
	 * @throws IOException
	 *             if the resource is missing or cannot be read
	 */
	private void loadWhiteDomains(String file) throws IOException {
		InputStream resource = DomainWhiteLister.class.getResourceAsStream(file);
		if (resource == null) {
			throw new IOException("Classpath resource not found: " + file);
		}
		// The reader is closed even when reading fails.
		try (BufferedReader br = new BufferedReader(new InputStreamReader(resource))) {
			// readLine() returning null is the end-of-stream signal; ready()
			// only reports whether a read would block.
			String line;
			while ((line = br.readLine()) != null) {
				whiteList.add(line);
			}
		}
	}

	/**
	 * Decides whether a candidate is acceptable with respect to the white
	 * list.
	 *
	 * @param candidateURL
	 *            the candidate to look up in the index
	 * @return <code>true</code> if the index returns no rdf:type triples for
	 *         the candidate, or if at least one returned object that contains
	 *         neither "wordnet" nor "wikicategory" is in the white list;
	 *         <code>false</code> otherwise
	 */
	public boolean fitsIntoDomain(String candidateURL) {
		List<Triple> tmp = index.search(candidateURL, "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", null);
		// Candidates without any type information are not filtered out.
		if (tmp.isEmpty()) {
			return true;
		}
		for (Triple triple : tmp) {
			String type = triple.getObject();
			if (!type.contains("wordnet") && !type.contains("wikicategory") && whiteList.contains(type)) {
				return true;
			}
		}
		return false;
	}
}
package org.aksw.agdistis.algorithm;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import org.aksw.agdistis.datatypes.Document;
import org.aksw.agdistis.datatypes.NamedEntitiesInText;
import org.aksw.agdistis.datatypes.NamedEntityInText;
import org.aksw.agdistis.graph.BreadthFirstSearch;
import org.aksw.agdistis.graph.HITS;
import org.aksw.agdistis.graph.Node;
import org.aksw.agdistis.graph.PageRank;
import org.aksw.agdistis.model.CandidatesScore;
import org.aksw.agdistis.util.TripleIndex;
import org.aksw.agdistis.util.TripleIndexContext;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import edu.uci.ics.jung.graph.DirectedSparseGraph;
/**
 * Disambiguates the named entities of a document: candidates are inserted
 * into a graph, the graph is expanded by a breadth-first search, its nodes
 * are ranked with HITS or PageRank, and the best-ranked node covering each
 * entity is assigned to that entity.
 */
public class NEDAlgo_HITS {

	private Logger log = LoggerFactory.getLogger(NEDAlgo_HITS.class);
	private String edgeType;
	private String nodeType;
	private CandidateUtil cu;
	private TripleIndex index;
	private TripleIndexContext indexByContext;
	// needed for the experiment about which properties increase accuracy
	private double threshholdTrigram;
	private int maxDepth;
	private Boolean heuristicExpansionOn;
	private String algorithm;
	private boolean context;

	/**
	 * Configures the algorithm from <code>/config/agdistis.properties</code>
	 * on the classpath. Each setting can be overridden by an
	 * <code>AGDISTIS_*</code> environment variable, which takes precedence
	 * over the properties file.
	 *
	 * @throws IOException
	 *             if the properties cannot be read or the candidate utility
	 *             cannot be created
	 */
	public NEDAlgo_HITS() throws IOException {
		Properties prop = new Properties();
		// try-with-resources: the properties stream was previously never closed
		try (InputStream input = NEDAlgo_HITS.class.getResourceAsStream("/config/agdistis.properties")) {
			prop.load(input);
		}

		this.nodeType = setting(prop, "AGDISTIS_NODE_TYPE", "nodeType");
		this.edgeType = setting(prop, "AGDISTIS_EDGE_TYPE", "edgeType");
		this.threshholdTrigram = Double.valueOf(setting(prop, "AGDISTIS_THRESHHOLD_TRIGRAM", "threshholdTrigram"));
		this.maxDepth = Integer.valueOf(setting(prop, "AGDISTIS_MAX_DEPTH", "maxDepth"));
		this.heuristicExpansionOn = Boolean
				.valueOf(setting(prop, "AGDISTIS_HEURISTIC_EXPANSION_ON", "heuristicExpansionOn"));
		this.algorithm = setting(prop, "AGDISTIS_ALGORITHM", "algorithm");

		this.cu = new CandidateUtil();
		this.index = cu.getIndex();

		this.context = Boolean.valueOf(setting(prop, "AGDISTIS_CONTEXT", "context"));
		if (context) { // in case the index by context exist
			this.indexByContext = cu.getIndexContext();
		}
	}

	/**
	 * Returns the value of the environment variable if it is set, otherwise
	 * the value of the given property (which may be <code>null</code>).
	 */
	private static String setting(Properties prop, String envName, String propertyKey) {
		String envValue = System.getenv(envName);
		return envValue != null ? envValue : prop.getProperty(propertyKey);
	}

	/**
	 * Runs the disambiguation on the given document and stores the chosen
	 * URI in each of its named entities. Any exception is logged and
	 * swallowed, so a failing document does not abort the caller.
	 *
	 * @param document
	 *            the document whose named entities are disambiguated
	 * @param candidatesPerNE
	 *            if not <code>null</code>, filled with all ranked candidates
	 *            (start position, URI, authority weight) of every entity
	 */
	public void run(Document document, Map<NamedEntityInText, List<CandidatesScore>> candidatesPerNE) {
		try {
			NamedEntitiesInText namedEntities = document.getNamedEntitiesInText();
			DirectedSparseGraph<Node, String> graph = new DirectedSparseGraph<Node, String>();

			// 0) insert candidates into Text
			log.debug("\tinsert candidates");
			cu.insertCandidatesIntoText(graph, document, threshholdTrigram, heuristicExpansionOn);

			// 1) let spread activation/ breadth first search run
			log.info("\tGraph size before BFS: " + graph.getVertexCount());
			BreadthFirstSearch bfs = new BreadthFirstSearch(index, algorithm);
			bfs.run(maxDepth, graph, edgeType, nodeType);
			log.info("\tGraph size after BFS: " + graph.getVertexCount());

			if (algorithm.equals("hits")) {
				// 2.1) let HITS run
				log.info("\trun HITS");
				HITS h = new HITS();
				h.runHits(graph, 20);
			} else if (algorithm.equals("pagerank")) {
				// 2.2) let Pagerank run
				log.info("\trun PageRank");
				PageRank pr = new PageRank();
				pr.runPr(graph, 50, 0.1);
			}

			// 3) store the candidate with the highest hub, highest authority
			// ratio
			// manipulate which value to use directly in node.compareTo
			log.debug("\torder results");
			ArrayList<Node> orderedList = new ArrayList<Node>();
			orderedList.addAll(graph.getVertices());
			Collections.sort(orderedList);
			for (NamedEntityInText entity : namedEntities) {
				for (Node m : orderedList) {
					// there can be one node (candidate) for two labels
					if (m.containsId(entity.getStartPos())) {
						entity.setNamedEntity(m.getCandidateURI());
						// the list is sorted, so the first hit is the best one
						break;
					}
				}
			}

			// To get all candidates along with their scores
			if (candidatesPerNE != null) {
				for (NamedEntityInText entity : namedEntities) {
					List<CandidatesScore> listCandidates = new ArrayList<>();
					for (Node m : orderedList) {
						// there can be one node (candidate) for two labels
						if (m.containsId(entity.getStartPos())) {
							CandidatesScore candidates = new CandidatesScore();
							candidates.setStart(entity.getStartPos());
							candidates.setUri(m.getCandidateURI());
							candidates.setScore(m.getAuthorityWeight());
							listCandidates.add(candidates);
						}
					}
					candidatesPerNE.put(entity, listCandidates);
				}
			}
		} catch (Exception e) {
			log.error("AGDISTIS cannot be run on this document.", e);
		}
	}

	/**
	 * Closes the triple index and, if the context option is enabled, the
	 * context index as well.
	 *
	 * @throws IOException
	 *             if closing an index fails
	 */
	public void close() throws IOException {
		try {
			index.close();
		} finally {
			// Close the context index even when closing the main index fails.
			if (context) {
				indexByContext.close();
			}
		}
	}

	public void setThreshholdTrigram(double threshholdTrigram) {
		this.threshholdTrigram = threshholdTrigram;
	}

	public void setMaxDepth(int maxDepth) {
		this.maxDepth = maxDepth;
	}

	public void setHeuristicExpansionOn(Boolean value) {
		this.heuristicExpansionOn = value;
	}

	public String getEdgeType() {
		return edgeType;
	}

	public void setEdgeType(String edgeType) {
		this.edgeType = edgeType;
	}

	public String getNodeType() {
		return nodeType;
	}

	/** Sets the node type here and on the candidate utility. */
	public void setNodeType(String nodeType) {
		this.nodeType = nodeType;
		this.cu.setNodeType(nodeType);
	}

	/** Sets the triple index here and on the candidate utility. */
	public void setIndex(TripleIndex index) {
		this.index = index;
		this.cu.setIndex(index);
	}
}
package org.aksw.agdistis.algorithm;
import java.util.Comparator;
import org.aksw.agdistis.datatypes.NamedEntityInText;
/**
 * Orders named entities by the value of their {@code getLength()}, smaller
 * values first.
 *
 * @author r.usbeck
 *
 */
public class NamedEntityLengthComparator implements Comparator<NamedEntityInText> {

	/**
	 * Compares the lengths of the two entities as doubles.
	 *
	 * @return a negative value, zero or a positive value if the length of
	 *         {@code o1} is smaller than, equal to or greater than the length
	 *         of {@code o2}
	 */
	@Override
	public int compare(NamedEntityInText o1, NamedEntityInText o2) {
		final double lengthOfFirst = o1.getLength();
		final double lengthOfSecond = o2.getLength();
		return Double.compare(lengthOfFirst, lengthOfSecond);
	}
}
package org.aksw.agdistis.datatypes;
public class Candidate implements Comparable<Candidate> {
public static final int NO_ENTITY_CANDIDATE_ID = -1;
public static final int OTHER_ENTITY_CANDIDATE_ID = -2;
private int id;
private String url;
private String label;
private String description;
private int outgoingEdgeCount;
public Candidate(String url, String label, String description) {
this(NO_ENTITY_CANDIDATE_ID, url, label, description);
}
public Candidate(int id, String url, String label, String description) {
this(id, url, label, description, -1);
}
public Candidate(String url, String label, String description, int outgoingEdgeCount) {
this(NO_ENTITY_CANDIDATE_ID, url, label, description, outgoingEdgeCount);