Repository: dbpedia/lookup Branch: master Commit: c13d8fcf6cc1 Files: 30 Total size: 86.2 KB Directory structure: gitextract_g106cma5/ ├── .gitattributes ├── .gitignore ├── .travis.yml ├── LICENSE ├── README.md ├── docker/ │ ├── Dockerfile_2015-10 │ └── README.md ├── pom.xml ├── run ├── scripts/ │ └── index.sh └── src/ ├── main/ │ ├── resources/ │ │ └── logback.xml │ └── scala/ │ └── org/ │ └── dbpedia/ │ └── lookup/ │ ├── entities/ │ │ ├── Entities.scala │ │ └── EntitiesSerialization.scala │ ├── inputformat/ │ │ ├── DBpediaNTriplesInputFormat.scala │ │ ├── InputFormat.scala │ │ ├── PignlprocTSVInputFormat.scala │ │ └── WikiStatsExtractor.scala │ ├── lucene/ │ │ ├── Indexer.scala │ │ ├── LuceneConfig.scala │ │ └── Searcher.scala │ ├── server/ │ │ ├── LookupResource.scala │ │ └── Server.scala │ └── util/ │ └── Logging.scala └── test/ ├── resources/ │ ├── data.nt │ ├── logback-test.xml │ └── redirects.nt └── scala/ └── org/ └── dbpedia/ └── lookup/ ├── IntegrationTest.scala ├── TestUtils.scala └── entities/ ├── EntitiesSerializationTest.scala └── EntitiesTest.scala ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitattributes ================================================ # normalize all text # when committed they are stored with LF, # on checkout they are converted to the OS's native line endings # https://help.github.com/articles/dealing-with-line-endings *.java text *.scala text *.sh text *.xml text *.json text *.nt text *.md text ================================================ FILE: .gitignore ================================================ .idea syntax: glob *.iml *.ipr *.iws target/* ================================================ FILE: .travis.yml ================================================ language: java jdk: - oraclejdk8 # Workaround for https://github.com/travis-ci/travis-ci/issues/5227 # Buffer overflow in 
Java_java_net_Inet4AddressImpl_getLocalHostName before_install: - cat /etc/hosts # optionally check the content *before* - sudo hostname "$(hostname | cut -c1-63)" - sed -e "s/^\\(127\\.0\\.0\\.1.*\\)/\\1 $(hostname | cut -c1-63)/" /etc/hosts | sudo tee /etc/hosts - cat /etc/hosts # optionally check the content *after* branches: only: - master install: /bin/true script: "mvn clean install" notifications: email: recipients: - dbpedia-developers@lists.sourceforge.net on_success: change on_failure: change ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. 
"Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ # IMPORTANT NOTE: There is a newer and DBpedia Databus compatible version of the DBpedia Lookup here: https://github.com/dbpedia/dbpedia-lookup. The discussion concerning the transition to the new service can be found here: https://forum.dbpedia.org/t/new-dbpedia-lookup-application/607 # DBpedia Lookup [![Build Status](https://travis-ci.org/dbpedia/lookup.svg?branch=master)](https://travis-ci.org/dbpedia/lookup) DBpedia Lookup is a web service that can be used to look up DBpedia URIs by related keywords. Related means that either the label of a resource matches, or an anchor text that was frequently used in Wikipedia to refer to a specific resource matches (for example the resource http://dbpedia.org/resource/United_States can be looked up by the string "USA"). The results are ranked by the number of inlinks pointing from other Wikipedia pages at a result page. ## Web APIs Two APIs are offered: Keyword Search and Prefix Search. A hosted version of the Lookup service is available on the DBpedia server infrastructure. ### Keyword Search The Keyword Search API can be used to find related DBpedia resources for a given string. The string may consist of a single or multiple words. 
Example: Places that have the related keyword "berlin" http://lookup.dbpedia.org/api/search/KeywordSearch?QueryClass=place&QueryString=berlin ### Prefix Search (i.e. Autocomplete) The Prefix Search API can be used to implement autocomplete input boxes. For a given partial keyword like *berl* the API returns URIs of related DBpedia resources like http://dbpedia.org/resource/Berlin. Example: Top five resources for which a keyword starts with "berl" http://lookup.dbpedia.org/api/search/PrefixSearch?QueryClass=&MaxHits=5&QueryString=berl ### Parameters The query parameters accepted by the endpoints are * `QueryString`: a string for which a DBpedia URI should be found. * `QueryClass`: a DBpedia class from the Ontology that the results should have (for owl#Thing and untyped resources, leave this parameter empty). * `MaxHits`: the maximum number of returned results (default: 5) ### JSON support By default all data is returned as XML; the service also returns JSON for any request that includes the `Accept: application/json` header. ## Running a local mirror of the webservice ### Clone and build DBpedia Lookup git clone git://github.com/dbpedia/lookup.git cd lookup mvn clean install ### Download and configure the index You can get our indexes from [HERE](http://downloads.dbpedia-spotlight.org/dbpedia_lookup/) ### Run the server `./run Server [PATH TO THE INDEX]/[VERSION]/` E.g: `./run Server /opt/dbpedia-lookup/2015-04` **Note: The index file must be decompressed** #### Available versions: * current - from Latest DBpedia Dump (2015-10) #### Available languages (i18n work in progress): * en - English The server should now be running at http://localhost:1111 ## Rebuilding the index Rebuilding an index is usually not required: if you only intend to run a local mirror of the service, you can download a prebuilt index as outlined above. 
To re-build the index you will require * DBpedia datasets * [Wikistatsextractor output](http://downloads.dbpedia-spotlight.org) - [wikistatsextractor](https://github.com/jodaiber/wikistatsextractor) is a drop-in replacement of [pignlproc](https://github.com/dbpedia-spotlight/pignlproc) * Unix ### Get the following DBpedia datasets from http://downloads.dbpedia.org/2015-10/core-i18n/en/ * redirects\_en.nt (or .ttl) * short\_abstracts\_en.nt (or .ttl) * instance\_types\_en.nt (or .ttl) * article\_categories\_en.nt (or .ttl) from http://downloads.dbpedia.org/2015-10/core * instance_types_en.ttl * instance_types_sdtyped_dbo_en.ttl * instance_types_transitive_en.ttl ### Concatenate all data and sort by URI This is necessary because indexing in sorted order is significantly faster. cat instance_types_en.nt (or .ttl) \ short_abstracts_en.nt (or .ttl) \ article_categories_en.nt (or .ttl) \ instance_types_en.ttl \ instance_types_sdtyped_dbo_en.ttl \ instance_types_transitive_en.ttl | sort >all_dbpedia_data.nt (or .ttl) ### Get the dataset redirects\_en.nt (or .ttl) Redirects are not indexed, but they are excluded as targets of lookup. ### Run Indexer The indexer has to be run twice: 1. with the DBpedia data ./run Indexer lookup_index_dir redirects_en.nt (or .ttl) all_dbpedia_data.nt (or .ttl) 2. with the wikistatsextractor data ./run Indexer lookup_index_dir redirects_en.nt (or .ttl) pairCounts ## Support and feedback The best way to get support or give feedback on the Lookup project is via the [DBpedia discussion mailing list](https://lists.sourceforge.net/lists/listinfo/dbpedia-discussion). More technical queries about the code base should be directed to the [DBpedia developers mailing list](https://lists.sourceforge.net/lists/listinfo/dbpedia-developers). The [DBpedia wiki](http://wiki.dbpedia.org/lookup/) also has useful information on the project. 
## Maintainers * Kunal Jha [@Kunal-Jha](https://github.com/Kunal-Jha) * Sandro Coelho [@sandroacoelho](https://github.com/sandroacoelho) * Pablo Mendes [@pablomendes](https://github.com/pablomendes) (less active) * Max Jakob [@maxjakob](https://github.com/maxjakob) (less active) * Matt Haynes [@matth](https://github.com/matth) (less active) ================================================ FILE: docker/Dockerfile_2015-10 ================================================ FROM java:8 MAINTAINER DBpedia Team RUN apt-get update && apt-get install -y \ curl ENV INDEX_URL downloads.dbpedia-spotlight.org/dbpedia_lookup/models ENV INDEX_FILENAME 2015-10.tar.gz ENV LOOKUP_JAR dbpedia-lookup-3.1-jar-with-dependencies.jar ENV LOOKUP_URL downloads.dbpedia-spotlight.org/dbpedia_lookup/ RUN mkdir -p /opt/lookup && \ cd /opt/lookup && \ wget "http://$LOOKUP_URL/$LOOKUP_JAR" -O $LOOKUP_JAR && \ wget "http://$INDEX_URL/$INDEX_FILENAME" -O $INDEX_FILENAME && \ tar xvf $INDEX_FILENAME && \ rm $INDEX_FILENAME EXPOSE 1111 ================================================ FILE: docker/README.md ================================================ ## Supported tags and respective Dockerfile links * latest (DBpedia dump 2015-10) ## How to run * English - ``docker run -p 1111:1111 -it dbpedia/lookup java -jar /opt/lookup/dbpedia-lookup-3.1-jar-with-dependencies.jar /opt/lookup/2015-10/`` And then try ``` http://localhost:1111/api/search/PrefixSearch?QueryClass=&MaxHits=5&QueryString=berl ``` If you are using Docker Compose you can do the same with this minimal Compose file: ```yml version: '2' services: lookup: container_name: lookup image: dbpedia/lookup ports: - "1111:1111" command: java -jar /opt/lookup/dbpedia-lookup-3.1-jar-with-dependencies.jar /opt/lookup/2015-10/ ``` ## Supported Docker versions This image is officially supported on Docker version 1.9.1. 
Please see the [Docker installation documentation](https://docs.docker.com/installation/) for details on how to upgrade your Docker daemon. ## Issues If you have any problems with or questions about this image, please contact us through a [GitHub issue](http://github.com/dbpedia/lookup/issues). ## Contributing First of all, thank you for helping! :) Please see [DBpedia Contribute Guide](https://github.com/dbpedia/lookup/wiki/Contributing) for details on how to contribute ================================================ FILE: pom.xml ================================================ 4.0.0 org.dbpedia.lookup dbpedia-lookup jar 3.1 DBpedia Lookup Service DBpedia Lookup is a web service that can be used to look up DBpedia URIs by related keywords http://www.dbpedia.org Apache License, Version 2.0 http://www.apache.org/licenses/LICENSE-2.0.txt repo DBpedia Team dbpedia-developers@lists.sourceforge.net DBpedia http://www.dbpedia.org scm:git:git@github.com:dbpedia/extraction-framework.git scm:git:git@github.com:dbpedia/lookup.git git@github.com:dbpedia/extraction-framework.git UTF-8 UTF-8 3.2.2 2.12.4 2.11.7 2.11 2.2.6 1.0 4.0 3.6.2 1.19.1 1.0 2.6.2 1.0.9 1.2.3 AKSW.GPG maven-resources-plugin 2.6 process-test-resources process-test-resources testResources copy-resources compile resources net.alchim31.maven scala-maven-plugin ${maven.scala.plugin.version} compile compile compile test-compile testCompile test-compile attach-docs-sources add-source doc-jar -deprecation Server org.dbpedia.lookup.server.Server -Xmx512m -Dlogback.configurationFile=logback.xml -Dhttp.port=1111 Indexer org.dbpedia.lookup.lucene.Indexer -Xms5g -Xmx5g -Dlogback.configurationFile=logback.xml org.scalatest scalatest-maven-plugin ${maven.scala.test.plugin.version} test test maven-assembly-plugin package single true org.dbpedia.lookup.server.Server jar-with-dependencies org.sonatype.plugins nexus-staging-maven-plugin true ossrh https://oss.sonatype.org/ true org.apache.maven.plugins 
maven-source-plugin attach-sources jar org.scala-lang scala-compiler ${scala.compiler.version.revision} provided org.scala-lang scala-library ${scala.compiler.version.revision} org.dbpedia.extraction core ${dbpedia.extraction.version} org.apache.lucene lucene-core ${lucene.core.version} com.sun.jersey jersey-bundle ${jersey.server.version} org.apache.commons commons-compress ${commons.compress.version} net.liftweb lift-json_${scala.compiler.version} ${lift.json.version} ch.qos.logback logback-classic ${logback.classic.version} org.semanticweb.yars nxparser ${yars.nxparser.version} org.scalatest scalatest_${scala.compiler.version} ${scala.test.version} test spotlight-releases-repository https://github.com/dbpedia-spotlight/maven-repo/raw/master/releases sonatype-oss-public Sonatype OSS Public Maven repo https://oss.sonatype.org/content/groups/public sonatype-oss-public Sonatype OSS Public Maven repo https://oss.sonatype.org/content/groups/public release org.apache.maven.plugins maven-gpg-plugin 1.6 sign-artifacts verify sign AKSW ${aksw.keyname} org.apache.maven.plugins maven-release-plugin 2.5.2 deploy true false release v@{project.version} [maven-release-plugin] [ci build-cli] [ci build-webdemo] - ================================================ FILE: run ================================================ #!/bin/bash # Shortcut for mvn scala:run -Dlauncher=... -DaddArgs=... 
#!/bin/bash
#
# Builds a DBpedia Lookup index for one DBpedia release and one language.
#
# Usage: index.sh DBPEDIA_VERSION LANG
#   DBPEDIA_VERSION  release directory below http://downloads.dbpedia.org (e.g. 2015-10)
#   LANG             language code of the core-i18n datasets (e.g. en)
#
# Steps: download the datasets, decompress them, concatenate the indexable
# ones, clone and build the lookup project, run the Indexer and finally pack
# the resulting index into a tarball inside $DBPEDIA_ROOT.

# Name used as prefix of every error message printed by error_exit.
readonly PROGNAME="$(basename "$0")"

readonly DBPEDIA_VERSION=$1
readonly LANG_i18n=$2
readonly DBPEDIA_DOWNLOADS="http://downloads.dbpedia.org/$DBPEDIA_VERSION/core-i18n"
readonly DBPEDIA_ROOT=~/lookup
readonly DBPEDIA_DATA="$DBPEDIA_ROOT/dbpedia_data/$DBPEDIA_VERSION"
readonly DBPEDIA_INDEX="dbpedia-lookup-index/$LANG_i18n/$DBPEDIA_VERSION"
# Directory that holds the downloaded and decompressed datasets of this language.
readonly LANG_DATA="$DBPEDIA_DATA/$LANG_i18n"
readonly ALL_FILES=(redirects short_abstracts instance_types article_categories)

#+------------------------------------------------------------------------------------------------------------------------------+
#| Functions                                                                                                                    |
#+------------------------------------------------------------------------------------------------------------------------------+

# Error_exit function by William Shotts. http://stackoverflow.com/questions/64786/error-handling-in-bash
# Prints "$PROGNAME: <message>" to stderr and terminates the script with status 1.
#   $1  message (optional, defaults to "Unknown Error"); backslash escapes are interpreted
function error_exit
{
    echo -e "${PROGNAME}: ${1:-"Unknown Error"}" 1>&2
    exit 1
}

# The function used to create all the directories needed.
#   $1  directory to create (including missing parents); left untouched if it already exists
function create_dir()
{
    if [ -e "$1" ]; then
        echo -e "$1 already exists. Skipping creating this directory!"
    else
        mkdir -p "$1" || error_exit "ERROR: Cannot create directory '$1'.\n"
    fi
}

# A helper function to download files from a given path.
# Three-argument form:
#   $1  path from where to download the file, without the file name
#   $2  file name
#   $3  directory where the file is saved
# Four-argument form: same as above, shifted by one, with $1 passed to wget
# in front of the URL.
function download_file()
{
    # Only downloads if there is no current file or there is a newer version (wget -N)
    case "$#" in
        "3")
            if wget -q --spider "$1/$2"; then
                wget -N "$1/$2" --directory-prefix="$3" \
                    || error_exit "ERROR: Downloading the file '$2' failed.\n"
            else
                # The file can't be found. Show the file name to the user
                error_exit "ERROR: The file '$2' cannot be found for download.\n"
            fi
            ;;
        "4")
            # NOTE(review): $1 is presumably one or more extra wget options; it is left
            # unquoted on purpose so that word splitting behaves as before - confirm.
            # shellcheck disable=SC2086
            if wget -q --spider $1 "$2/$3"; then
                # shellcheck disable=SC2086
                wget -N $1 "$2/$3" --directory-prefix="$4" \
                    || error_exit "ERROR: Downloading the file '$3' failed.\n"
            else
                # The file can't be found. Show the file name to the user
                error_exit "ERROR: The file '$3' cannot be found for download.\n"
            fi
            ;;
        *)
            error_exit "ERROR: Incorrect number of parameters!"
            ;;
    esac
    echo -e "done!\n"
}

#-----------------------------------------------------------------------------------------------------------------------------+

# Without both arguments every path above silently collapses to a wrong location.
if [ -z "$DBPEDIA_VERSION" ] || [ -z "$LANG_i18n" ]; then
    error_exit "Usage: $PROGNAME DBPEDIA_VERSION LANG   (e.g. $PROGNAME 2015-10 en)"
fi

create_dir "$DBPEDIA_DATA"
create_dir "$LANG_DATA"
create_dir "$DBPEDIA_ROOT/$DBPEDIA_INDEX"

for i in "${ALL_FILES[@]}"
do
    download_file "$DBPEDIA_DOWNLOADS/$LANG_i18n" "${i}_$LANG_i18n.ttl.bz2" "$LANG_DATA"
done

for i in "${ALL_FILES[@]}"
do
    bunzip2 -dc "$LANG_DATA/${i}_$LANG_i18n.ttl.bz2" > "$LANG_DATA/${i}_$LANG_i18n.nt" \
        || error_exit "ERROR: Cannot decompress '${i}_$LANG_i18n.ttl.bz2'.\n"
done

# Redirects are passed to the Indexer separately, so they are not part of the concatenation.
cat "$LANG_DATA/short_abstracts_$LANG_i18n.nt" \
    "$LANG_DATA/instance_types_$LANG_i18n.nt" \
    "$LANG_DATA/article_categories_$LANG_i18n.nt" > "$LANG_DATA/all_dbpedia_data.nt" \
    || error_exit "ERROR: Cannot create '$LANG_DATA/all_dbpedia_data.nt'.\n"

# Reuse an existing checkout on re-runs instead of failing in git clone.
if [ ! -d lookup/.git ]; then
    git clone https://github.com/dbpedia/lookup.git \
        || error_exit "ERROR: Cannot clone the lookup repository.\n"
fi

cd lookup || error_exit "ERROR: Cannot enter the lookup checkout.\n"

mvn clean install || error_exit "ERROR: Building the lookup project failed.\n"

./run Indexer "$DBPEDIA_ROOT/$DBPEDIA_INDEX" "$LANG_DATA/redirects_$LANG_i18n.nt" "$LANG_DATA/all_dbpedia_data.nt" \
    || error_exit "ERROR: The Indexer failed.\n"

cd "$DBPEDIA_ROOT" || error_exit "ERROR: Cannot enter '$DBPEDIA_ROOT'.\n"

tar -zcvf "${LANG_i18n}_$DBPEDIA_VERSION.tar.gz" "$DBPEDIA_INDEX" \
    || error_exit "ERROR: Cannot pack the index.\n"
/**
 * An ontology class attached to a lookup result, identified by its URI.
 *
 * The label is computed from the URI:
 *  - a URI ending in "owl#Thing" gets the fixed label "owl#Thing";
 *  - otherwise the DBpedia ontology and schema.org namespace prefixes are removed,
 *    the remainder is passed through `wikiDecode`, a space is inserted before every
 *    upper-case ASCII letter and the result is trimmed and lower-cased.
 *
 * @param uri URI of the class
 */
case class OntologyClass(uri: String) extends Uri with Label {

    val label: String = {
        if (uri endsWith "owl#Thing") {
            "owl#Thing"
        }
        else {
            val localName = wikiDecode(
                uri.replace("http://dbpedia.org/ontology/", "")
                   .replace("http://schema.org/", "")
            )
            // Lower-case with a fixed locale: the parameterless toLowerCase uses the JVM
            // default locale, under which e.g. a Turkish system maps 'I' to a dotless 'ı',
            // so the same URI would produce different labels on different machines.
            localName.replaceAll("([A-Z])", " $1").trim.toLowerCase(java.util.Locale.ROOT)
        }
    }

}
/**
 * Input format that walks over a DBpedia N-Triples data set and yields one
 * (uri, Lucene field name, value) tuple for every triple whose predicate is
 * listed in `predicate2field`. All other triples are ignored.
 *
 * The parser is created once per instance and drained by `foreach`.
 *
 * @param dataSet   stream that is handed to the NxParser
 * @param redirects URIs of redirects; triples whose subject is contained in this set are skipped
 */
class DBpediaNTriplesInputFormat(val dataSet: InputStream, val redirects: scala.collection.Set[String]) extends InputFormat {

    private val triples = new NxParser(dataSet)

    // RDF predicate -> name of the Lucene field its object is stored in
    val predicate2field = Map(
        "http://lexvo.org/ontology#label" -> LuceneConfig.Fields.SURFACE_FORM_KEYWORD,   // no DBpedia dataset, has to be created
        "http://dbpedia.org/property/refCount" -> LuceneConfig.Fields.REFCOUNT,          // no DBpedia dataset, has to be created
        "http://dbpedia.org/ontology/abstract" -> LuceneConfig.Fields.DESCRIPTION,
        "http://www.w3.org/2000/01/rdf-schema#comment" -> LuceneConfig.Fields.DESCRIPTION,
        "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" -> LuceneConfig.Fields.CLASS,
        "http://purl.org/dc/terms/subject" -> LuceneConfig.Fields.CATEGORY,
        "http://dbpedia.org/property/wikiPageUsesTemplate" -> LuceneConfig.Fields.TEMPLATE,  // not really necessary
        "http://dbpedia.org/ontology/wikiPageRedirects" -> LuceneConfig.Fields.REDIRECT      // not really necessary
    )

    /**
     * Applies `f` to every tuple produced from the remaining triples of the parser.
     *
     * @param f callback receiving (uri, field, value)
     */
    override def foreach[U](f: ((String,String,String)) => U): Unit = {
        while (triples.hasNext) {
            val nodes = triples.next
            val subject = nodes(0).toString
            val predicate = nodes(1).toString
            val value = nodes(2).toString

            predicate2field.get(predicate) match {
                // only mapped predicates whose subject is not a known redirect are passed on
                case Some(field: String) if !(redirects.nonEmpty && redirects.contains(subject)) =>
                    val tuple =
                        if (field == LuceneConfig.Fields.REDIRECT) (value, field, subject)   // make it a "hasRedirect" relation
                        else (subject, field, value)
                    f(tuple)
                case _ =>
            }
        }
    }

}
/**
 * Iterates over a pignlproc nerd-stats result (tab-separated values).
 *
 * For every row a REFCOUNT data point is emitted; a SURFACE_FORM_KEYWORD
 * data point is emitted only if the row's P(sf|uri) value is greater than
 * `pSfGivenUriThreshold`.
 *
 * Rows that have fewer columns than the highest configured column index are
 * skipped instead of aborting the whole run with an
 * ArrayIndexOutOfBoundsException (same policy as WikiStatsExtractor).
 *
 * @param dataSet              stream with the TSV data, read as UTF-8
 * @param pSfGivenUriThreshold exclusive lower bound for P(sf|uri)
 * @param uriField             column index of the resource name
 * @param sfField              column index of the surface form
 * @param pSfGivenUriField     column index of P(sf|uri)
 * @param refCountField        column index of the reference count
 */
class PignlprocTSVInputFormat(dataSet: InputStream, pSfGivenUriThreshold: Double, uriField: Int=0, sfField: Int=1, pSfGivenUriField: Int=3, refCountField: Int=6) extends InputFormat {

    val DBPEDIA_RESOURCE_NAMESPACE = "http://dbpedia.org/resource/"

    private val it = Source.fromInputStream(dataSet, "utf-8").getLines()

    // a row must have at least this many columns for all configured indexes to be readable
    private val requiredColumns = List(uriField, sfField, pSfGivenUriField, refCountField).max + 1

    /**
     * Consumes the remaining lines of the stream and applies `f` to the
     * data points derived from each well-formed row.
     */
    override def foreach[U](f: ((String,String,String)) => U) {
        while(it.hasNext) {
            // note: String.split drops trailing empty columns, so rows ending in tabs come out shorter
            val elements = it.next().split("\t")
            if (elements.length >= requiredColumns) {
                val uri = DBPEDIA_RESOURCE_NAMESPACE + WikiUtil.wikiEncode(elements(uriField))
                val sf = elements(sfField)
                val pSfGivenUri = elements(pSfGivenUriField)
                val uriCount = elements(refCountField)

                if (pSfGivenUri.toDouble > pSfGivenUriThreshold) {
                    f( (uri, LuceneConfig.Fields.SURFACE_FORM_KEYWORD, sf) )
                }
                f( (uri, LuceneConfig.Fields.REFCOUNT, uriCount) )
            }
        }
    }

}

/**
 * Iterates over a tab-separated file whose rows hold the surface form in
 * column 0, the page name in column 1 and the count in column 2.
 *
 * The page name is passed through WikiUtil.wikiEncode and used as URI as-is
 * (no namespace is prepended here). Rows with fewer than three columns are
 * skipped.
 *
 * @param dataSet              stream with the TSV data, read as UTF-8
 * @param pSfGivenUriThreshold accepted for signature parity with
 *                             PignlprocTSVInputFormat; not read by this class
 */
class WikiStatsExtractor(dataSet: InputStream, pSfGivenUriThreshold: Double) extends InputFormat {

    private val it = Source.fromInputStream(dataSet, "utf-8").getLines()

    /**
     * Consumes the remaining lines of the stream and emits one
     * SURFACE_FORM_KEYWORD and one REFCOUNT data point per well-formed row.
     */
    override def foreach[U](f: ((String,String,String)) => U) {
        while(it.hasNext) {
            val elements = it.next().split("\t")
            if (elements.size >= 3) {
                val uri = WikiUtil.wikiEncode(elements(1))
                val sf = elements(0)
                val uriCount = elements(2)

                f( (uri, LuceneConfig.Fields.SURFACE_FORM_KEYWORD, sf) )
                f( (uri, LuceneConfig.Fields.REFCOUNT, uriCount) )
            }
        }
    }

}
/**
 * Indexes the lookup data to a Lucene directory.
 *
 * Data points are collected per URI in memory and written to the index in
 * batches. For every URI the existing document (if any) is loaded, extended
 * with the new field values and written back with `updateDocument`.
 *
 * The reader used for that lookup is a point-in-time snapshot, so it is
 * re-opened after every commit. Without the refresh, a URI that shows up
 * again in a later batch or a later data file would not be found, a fresh
 * document would be created, and `updateDocument` would replace (and thereby
 * drop) everything indexed for that URI before.
 *
 * @param indexDir directory holding the Lucene index
 */
class Indexer(val indexDir: File) extends Logging {

    import scala.collection.mutable

    private val indexWriter = new IndexWriter(FSDirectory.open(indexDir), LuceneConfig.indexWriterConfig)
    // commit before a reader is opened on the directory
    indexWriter.commit()

    // snapshot view on the committed index; replaced by refreshSearcher() after each commit
    private var indexReader = IndexReader.open(FSDirectory.open(indexDir))
    private var indexSearcher = new IndexSearcher(indexReader)

    logger.info("Directory "+indexDir+" opened for indexing")

    /**
     * Index a data file for the lookup service.
     *
     * @param dataTraversable source of (uri, field, value) data points
     */
    def index(dataTraversable: InputFormat) {
        var count = 0

        // uri -> field -> values collected since the last index update
        val collector = mutable.HashMap[String, mutable.HashMap[String, mutable.HashSet[String]]]()

        dataTraversable.foreach{ case (uri:String, field:String, value:String) => {

            val fields = collector.getOrElseUpdate(uri, mutable.HashMap[String, mutable.HashSet[String]]())
            fields.getOrElseUpdate(field, mutable.HashSet[String]()).add(value)

            count += 1
            if(count%100000 == 0) {
                logger.info(count+" data points read")
            }
            if(count%LuceneConfig.commitAfterDataPointsNum == 0) {
                updateIndex(collector)
                collector.clear()
            }
        }}

        updateIndex(collector)
        logger.info(count+" data points indexed. Done")

        //TODO remove?
        logger.info("Optimizing")
        indexWriter.optimize()
        logger.info("Done optimizing")
    }

    /**
     * Closes the searcher, the reader and the index writer.
     */
    def close() {
        try {
            indexSearcher.close()
            indexReader.close()
        }
        finally {
            indexWriter.close()
        }
        logger.info("Closed index "+indexDir)
    }

    /** Writes all collected data to the index, commits and refreshes the lookup reader. */
    private def updateIndex(collector: mutable.HashMap[String, mutable.HashMap[String, mutable.HashSet[String]]]) {
        logger.info("Updating")
        for ((uri, fields) <- collector) {
            updateDataForUri(uri, fields)
        }
        logger.info("Committing")
        indexWriter.commit()
        refreshSearcher()
    }

    /** Re-opens the reader if the index changed so that later lookups see the committed documents. */
    private def refreshSearcher() {
        // openIfChanged returns null when the old reader is still current
        val newReader = IndexReader.openIfChanged(indexReader)
        if (newReader != null) {
            indexSearcher.close()
            indexReader.close()
            indexReader = newReader
            indexSearcher = new IndexSearcher(newReader)
        }
    }

    /** Loads or creates the document of `currentUri`, adds the collected values and writes it back. */
    private def updateDataForUri(currentUri: String, fieldCollector: mutable.HashMap[String, mutable.HashSet[String]]) {
        val uriTerm = new Term(LuceneConfig.Fields.URI, currentUri)
        // asking for two hits is enough to detect a duplicated URI
        val hits = indexSearcher.search(new TermQuery(uriTerm), 2)

        val doc = hits.scoreDocs.length match {
            case 0 => newDocument(uriTerm)
            case 1 => indexSearcher.doc(hits.scoreDocs(0).doc)
            case _ => throw new IllegalStateException("Given Term matches more than 1 document in the index.")
        }

        indexWriter.updateDocument(uriTerm, getUpdatedDocument(doc, uriTerm, fieldCollector))
    }

    /** Creates the initial document of a URI: the URI itself plus a surface form derived from it. */
    private def newDocument(uriTerm: Term): Document = {
        val d = new Document
        updateField(d, new Field(LuceneConfig.Fields.URI, uriTerm.text, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO))

        val label = WikiUtil.wikiDecode(uriTerm.text.replace("http://dbpedia.org/resource/", ""))
        updateField(d, new Field(LuceneConfig.Fields.SURFACE_FORM_KEYWORD, label, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO))
        val prefixTerm = LuceneConfig.PrefixSearchPseudoAnalyzer.analyze(label)
        updateField(d, new Field(LuceneConfig.Fields.SURFACE_FORM_PREFIX, prefixTerm, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO))
        d
    }

    /**
     * Adds all collected values to `doc`. Surface forms are stored analyzed
     * and additionally as pseudo-analyzed prefix term (each distinct prefix
     * term once per call); all other fields are stored not analyzed.
     */
    private def getUpdatedDocument(doc: Document, uriTerm: Term, fields: scala.collection.Map[String, scala.collection.Set[String]]): Document = {
        for((field, valueSet) <- fields) {
            val addedPrefixTerms = new mutable.HashSet[String]()
            for(value <- valueSet) {
                if(field == LuceneConfig.Fields.SURFACE_FORM_KEYWORD) {
                    updateField(doc, new Field(LuceneConfig.Fields.SURFACE_FORM_KEYWORD, value, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.NO))
                    val prefixTerm = LuceneConfig.PrefixSearchPseudoAnalyzer.analyze(value)
                    // HashSet.add returns true only for elements that were not present yet
                    if (addedPrefixTerms.add(prefixTerm)) {
                        updateField(doc, new Field(LuceneConfig.Fields.SURFACE_FORM_PREFIX, prefixTerm, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.NO))
                    }
                }
                else {
                    updateField(doc, new Field(field, value, Field.Store.YES, Field.Index.NOT_ANALYZED_NO_NORMS, Field.TermVector.NO))
                }
            }
        }
        doc
    }

    private def updateField(doc: Document, field: Field) {
        doc.add(field)
    }

}

object Indexer extends Logging {

    private val pSfGivenUriThreshold = 0.001

    /**
     * Index data to a directory.
     *
     * Arguments: index directory, redirects N-Triples file, then any number
     * of data files. Files ending in ".bz2" are decompressed on the fly.
     *
     * @throws IllegalArgumentException if fewer than two arguments are given
     *                                  or a data file type is not supported
     */
    def main(args: Array[String]) {
        if (args.length < 2) {
            throw new IllegalArgumentException("usage: Indexer <index directory> <redirects file> [<data file> ...]")
        }
        val indexDir = new File(args(0))
        val redirectsFile = new File(args(1))
        val data = args.drop(2)

        // read at most once, and only if an N-Triples data file needs it
        lazy val redirectUris = getRedirectUris(redirectsFile)

        val indexer = new Indexer(indexDir)
        try {
            for(fileName <- data) {
                val fileIn = new FileInputStream(fileName)
                try {
                    val in: InputStream =
                        if (fileName.endsWith(".bz2")) new BZip2CompressorInputStream(fileIn) else fileIn

                    logger.info("Indexing "+fileName)
                    indexer.index(getDataInput(fileName, in, redirectUris))
                    logger.info("Done Indexing "+fileName)
                }
                finally {
                    fileIn.close()
                }
            }
        }
        finally {
            indexer.close()
        }
    }

    /**
     * Chooses the input format from the file name.
     *
     * `redirects` is passed by name so that the redirects file is only read
     * when an N-Triples format is actually selected.
     */
    private def getDataInput(fileName: String, inputStream: InputStream, redirects: => scala.collection.Set[String]): InputFormat = {
        if (fileName.contains(".nt") || fileName.contains(".nq") || fileName.contains(".ttl")) {
            logger.debug("using DBpediaNTriplesInputFormat")
            new DBpediaNTriplesInputFormat(inputStream, redirects)
        }
        else if (fileName.contains(".tsv")) {
            logger.debug("using PignlprocTSVInputFormat")
            val refCountField = if (fileName.contains("_alx")) 7 else 6
            new PignlprocTSVInputFormat(inputStream, pSfGivenUriThreshold, refCountField=refCountField)
        }
        else if (fileName.contains("pairCounts")) {
            new WikiStatsExtractor(inputStream, pSfGivenUriThreshold)
        }
        else {
            throw new IllegalArgumentException("only know how to handle file types .nt, .nq, .ttl, .tsv and pairCounts files; got "+fileName)
        }
    }

    /**
     * Reads the subjects of all triples of the redirects file.
     *
     * @throws Exception if a triple uses a predicate other than
     *                   http://dbpedia.org/ontology/wikiPageRedirects
     */
    private def getRedirectUris(redirectsFile: File): scala.collection.Set[String] = {
        val reds = new scala.collection.mutable.HashSet[String]()
        logger.info("Reading redirects from "+redirectsFile)
        val redirectsIn = new FileInputStream(redirectsFile)
        try {
            val parser = new NxParser(redirectsIn)
            while (parser.hasNext) {
                val triple = parser.next
                if(triple(1).toString != "http://dbpedia.org/ontology/wikiPageRedirects") {
                    throw new Exception("predicate must be http://dbpedia.org/ontology/wikiPageRedirects; got "+triple(1).toString)
                }
                reds.add(triple(0).toString)
            }
        }
        finally {
            redirectsIn.close()
        }
        logger.info("Done reading redirects")
        reds
    }

}
object PrefixSearchPseudoAnalyzer {

    private val prefixSearchQueryParser = new QueryParser(version, Fields.SURFACE_FORM_KEYWORD, analyzer)

    /**
     * Runs `keyword` through the keyword analyzer by parsing it as a quoted
     * phrase query and converting the parsed query back to a string: the
     * field prefix and the surrounding quotes are stripped and the result is
     * lower-cased.
     *
     * The parser instance is shared and QueryParser is not thread safe, so
     * the whole call is serialized on this object.
     *
     * @param keyword raw surface form or search keyword
     * @return the pseudo-analyzed term to index or search NOT_ANALYZED
     */
    def analyze(keyword: String): String = synchronized {
        prefixSearchQueryParser.parse('"' + QueryParser.escape(keyword) + '"')
            .toString.replace(Fields.SURFACE_FORM_KEYWORD+":", "")
            .replaceFirst("^\"", "")
            .replaceFirst("\"$", "")
            .toLowerCase
    }

}

/** Names of the fields of a document in the lookup index. */
object Fields {
    val URI = "URI"
    val SURFACE_FORM_KEYWORD = "SURFACE_FORM_KEYWORD"
    val SURFACE_FORM_PREFIX = "SURFACE_FORM_PREFIX"
    val REFCOUNT = "REFCOUNT"
    val DESCRIPTION = "DESCRIPTION"
    val CLASS = "CLASS"
    val CATEGORY = "CATEGORY"
    val TEMPLATE = "TEMPLATE"
    val REDIRECT = "REDIRECT"
}
/**
 * Queries the Lucene index for the best URIs given a surface form.
 *
 * Results are sorted by the REFCOUNT field in descending order.
 *
 * @param indexDir directory holding the Lucene index
 */
class Searcher(val indexDir: File) {

    private val indexReader = IndexReader.open(FSDirectory.open(indexDir))

    private val indexSearcher = new IndexSearcher(indexReader)

    private val sort = new Sort(new SortField(LuceneConfig.Fields.REFCOUNT, SortField.INT, true))

    private val queryParser = new QueryParser(LuceneConfig.version, LuceneConfig.Fields.SURFACE_FORM_KEYWORD, LuceneConfig.analyzer)

    /**
     * Phrase search on the surface forms.
     *
     * @param keyword       search phrase; null or empty yields no results
     * @param ontologyClass optional class restriction (full URI, prefixed name or label)
     * @param maxResults    maximum number of results; values below 1 yield no results
     */
    def keywordSearch(keyword: String, ontologyClass: String="", maxResults: Int=5): List[Result] = {
        if(keyword == null || keyword.isEmpty) {
            return List.empty
        }
        search(getQuery(keyword, ontologyClass, prefixQuery = false), maxResults)
    }

    /**
     * Prefix search on the pseudo-analyzed surface forms.
     *
     * @param keyword       beginning of a surface form; null or empty yields no results
     * @param ontologyClass optional class restriction (full URI, prefixed name or label)
     * @param maxResults    maximum number of results; values below 1 yield no results
     */
    def prefixSearch(keyword: String, ontologyClass: String="", maxResults: Int=5): List[Result] = {
        if(keyword == null || keyword.isEmpty) {
            return List.empty
        }
        search(getQuery(keyword, ontologyClass, prefixQuery = true), maxResults)
    }

    /** Closes the searcher and the underlying index reader. */
    def close() {
        indexSearcher.close()
        indexReader.close()
    }

    private def search(query: Query, maxResults: Int): List[Result] = {
        // Lucene rejects a non-positive hit count with an exception; answer with an empty list instead
        if (maxResults <= 0) {
            List.empty
        }
        else {
            indexSearcher.search(query, null, maxResults, sort).scoreDocs.toList.map(getResult)
        }
    }

    /** Builds the boolean query: the keyword part is mandatory, as is the class restriction if there is one. */
    private def getQuery(keyword: String, ontologyClass: String, prefixQuery: Boolean = false): Query = {
        val bq = new BooleanQuery
        val decodedKeyword = WikiUtil.wikiDecode(keyword)

        val keywordQuery: Query =
            if(prefixQuery) {
                val pseudoAnalyzedKeyword = LuceneConfig.PrefixSearchPseudoAnalyzer.analyze(decodedKeyword)
                new PrefixQuery(new Term(LuceneConfig.Fields.SURFACE_FORM_PREFIX, pseudoAnalyzedKeyword))
            }
            else {
                val escapedKeyword = QueryParser.escape(decodedKeyword)
                synchronized {  // query parser is not thread safe!
                    queryParser.parse('"' + escapedKeyword + '"')  //quotes keep word order
                }
            }
        bq.add(keywordQuery, BooleanClause.Occur.MUST)

        getOntologyClassQuery(ontologyClass) match {
            case Some(q: Query) => bq.add(q, BooleanClause.Occur.MUST)
            case _ =>
        }

        bq
    }

    /**
     * Translates the class restriction into a term query on the CLASS field.
     *
     * The value is trimmed once up front so that surrounding whitespace
     * cannot defeat the "thing" check or the namespace detection.
     * No restriction is produced for null, blank, "thing" or "...#thing".
     */
    private def getOntologyClassQuery(ontologyClass: String): Option[Query] = {
        val ontologyPrefix = "http://dbpedia.org/ontology/"
        val requested = if (ontologyClass == null) "" else ontologyClass.trim
        val lowerCased = requested.toLowerCase

        if(requested.isEmpty || lowerCased == "thing" || lowerCased.endsWith("#thing")) {
            None
        }
        else {
            val classUri =
                //is full class URI
                if(requested.startsWith(ontologyPrefix)) {
                    requested
                }
                //abbreviated namespace prefix
                else if(requested.startsWith("dbpedia:") || requested.startsWith("dbpedia-owl:")) {
                    ontologyPrefix + requested.replace("dbpedia:", "").replace("dbpedia-owl:", "")
                }
                //label given: make camel case and attach namespace
                else {
                    ontologyPrefix + requested.split(" ").map(_.capitalize).mkString("")
                }
            Some(new TermQuery(new Term(LuceneConfig.Fields.CLASS, classUri)))
        }
    }

    /** Loads the document of a hit and converts its stored fields into a Result. */
    private def getResult(scoreDoc: ScoreDoc): Result = {
        val doc = indexReader.document(scoreDoc.doc)

        // all values stored under a field name; a null array is treated as "no values"
        def storedValues(field: String): Set[String] = {
            Option(doc.getValues(field)).map(_.toSet).getOrElse(Set.empty[String])
        }

        val uri: String = doc.get(LuceneConfig.Fields.URI)
        val description: String = doc.get(LuceneConfig.Fields.DESCRIPTION)
        val ontologyClasses: Set[OntologyClass] = storedValues(LuceneConfig.Fields.CLASS).map(classUri => new OntologyClass(classUri))
        val categories: Set[Category] = storedValues(LuceneConfig.Fields.CATEGORY).map(categoryUri => new Category(categoryUri))
        val templates: Set[Template] = storedValues(LuceneConfig.Fields.TEMPLATE).map(templateUri => new Template(templateUri))
        val redirects: Set[Redirect] = storedValues(LuceneConfig.Fields.REDIRECT).map(redirectUri => new Redirect(redirectUri))
        val refCount: Int = doc.get(LuceneConfig.Fields.REFCOUNT) match {
            case null => 0
            case count: String => count.toInt
        }

        new Result(uri, description, ontologyClasses, categories, templates, redirects, refCount)
    }

}
/**
 * Makes the given Searcher available for @Context injection into resources.
 */
class SearcherProvider(searcher: Searcher)
    extends SingletonTypeInjectableProvider[Context, Searcher](classOf[Searcher], searcher)

/**
 * DBpedia Lookup HTTP server exposing LookupResource on the given port.
 *
 * The searcher is not closed by `stop()`; whoever created it closes it.
 *
 * @param port     port used in the server URI
 * @param searcher searcher injected into the lookup resource
 */
class Server(port: Int, searcher: Searcher) {

    val resources = {
        val config = new ClassNamesResourceConfig(classOf[LookupResource])
        config.getSingletons.add(new SearcherProvider(searcher))
        config
    }

    val serverUri = new URI("http://localhost:" + port.toString + "/")

    val server = HttpServerFactory.create(serverUri, resources)

    /** Starts the HTTP server. */
    def start() {
        server.start()
    }

    /** Stops the HTTP server without a grace period. */
    def stop() {
        server.stop(0)
    }
}

object Server extends Logging {

    @volatile private var running = true

    /**
     * Starts the lookup server on the index directory given as first
     * argument. The port is read from the system property "http.port"
     * (default 1111).
     *
     * The method blocks until the JVM is asked to shut down; a shutdown hook
     * then clears the `running` flag so that the HTTP server is stopped and
     * the searcher is closed before the process exits. Without the hook the
     * flag was never cleared and the cleanup was unreachable.
     */
    def main(args : Array[String]) {
        val indexDir = new File(args(0))
        val port = System.getProperty("http.port", "1111").toInt

        val searcher = new Searcher(indexDir)
        val server = new Server(port, searcher)
        server.start()

        val mainThread = Thread.currentThread
        Runtime.getRuntime.addShutdownHook(new Thread() {
            override def run() {
                running = false
                // give the main thread a bounded amount of time to finish its cleanup
                mainThread.join(5000)
            }
        })

        val baseUri = server.serverUri.toString
        logger.info("Server started in " + System.getProperty("user.dir") + " listening on " + baseUri)

        try {
            while(running) {
                Thread.sleep(100)
            }
        }
        finally {
            //Stop the HTTP server
            server.stop()
            searcher.close()
        }
    }
}
Logging { protected val logger = LoggerFactory.getLogger(getClass.getName) } ================================================ FILE: src/test/resources/data.nt ================================================ "Berlin is the capital city of Germany and one of the 16 states of Germany. With a population of 3.5\u00A0million people, Berlin is Germany's largest city and is the second most populous city proper and the eighth most populous urban area in the European Union. Located in northeastern Germany, it is the center of the Berlin-Brandenburg Metropolitan Region, which has 5.9\u00A0million residents from over 190 nations. Located in the European Plains, Berlin is influenced by a temperate seasonal climate."@en . . . . . . . . . . . . . . . . . . "Berlin"@en . "capital of Germany"@en . "100"^^ . "Beirut is the capital and largest city of Lebanon. As there has been no recent population census, the exact population is unknown; estimates in 2007 ranged from slightly less than 1\u00A0million to slightly more than 2\u00A0million. Located on a peninsula at the midpoint of Lebanon's Mediterranean coast, it serves as the country's largest and main seaport. The Beirut metropolitan area consists of the city and its suburbs."@en . . . . . . . . . . . . . . . . . . . . . . . . . . . . "Beirut"@en . "largest city of Lebanon"@en . "99"^^ . "Beirut is an American band which was originally the solo musical project of Santa Fe native Zachary Francis Condon, and later expanded into a band. The band's first performances were in New York, in May 2006, to support the release of their debut album, Gulag Orkestar. Beirut's music combines elements of indie-rock and world music."@en . . . . . . . . . "Beirut"@en . "American band"@en . "98"^^ . 
/**
 * Full stack test
 *
 * 1. build index
 * 2. Start server
 * 3. Interrogate server via Jersey client over HTTP
 * 4. Kill server, close the searcher, delete index
 */
class IntegrationTest extends FunSuite with BeforeAndAfterAll {

  val tmpDir = TestUtils.tempDirectory
  val port   = TestUtils.tempPort

  var server : Server = _
  var searcher : Searcher = _

  override def beforeAll() {
    Indexer.main(Array(tmpDir.toString, "src/test/resources/redirects.nt", "src/test/resources/data.nt"))
    searcher = new Searcher(tmpDir)
    server = new Server(port, searcher)
    server.start
  }

  override def afterAll() {
    // the fixtures are null if beforeAll failed half way
    if (server != null) server.stop
    if (searcher != null) searcher.close()
    // File.delete refuses non-empty directories, so the index files have to go first
    deleteRecursively(tmpDir)
  }

  /** Deletes a file, or a directory together with everything below it. */
  private def deleteRecursively(file: java.io.File) {
    val children = file.listFiles  // null for plain files
    if (children != null) {
      children.foreach(deleteRecursively)
    }
    file.delete
  }

  /** Performs a GET request against the test server and returns the raw response. */
  def get(path: String, accepts: String = "application/xml") = {
    val client = new Client
    val resource = client.resource("http://localhost:" + port.toString + path)
    resource.accept(accepts).get(classOf[ClientResponse])
  }

  test("KeywordSearch works") {
    val body = get("/api/search/KeywordSearch?QueryString=Beirut").getEntity(classOf[String])
    val xml  = XML.loadString(body)

    assert((xml \ "Result" \ "Label").head.text == "Beirut")
    assert((xml \ "Result" \ "Label").tail.head.text == "Beirut (band)")
    assert((xml \ "Result").size == 2)
  }

  test("PrefixSearch works") {
    val body = get("/api/search/PrefixSearch?QueryString=berl").getEntity(classOf[String])
    val xml  = XML.loadString(body)

    assert((xml \ "Result" \ "Label").head.text == "Berlin")
    assert((xml \ "Result").size == 1)
  }

  test("QueryClass works") {
    val body = get("/api/search/KeywordSearch?QueryClass=place&QueryString=Beirut").getEntity(classOf[String])
    val xml  = XML.loadString(body)

    assert((xml \ "Result" \ "Label").head.text == "Beirut")
    assert((xml \ "Result").size == 1)
  }

  test("MaxHits works") {
    val body = get("/api/search/KeywordSearch?MaxHits=1&QueryString=beirut").getEntity(classOf[String])
    val xml  = XML.loadString(body)

    assert((xml \ "Result").size == 1)
  }

  test("legacy .asmx in url is optional") {
    assert(get("/api/search.asmx/KeywordSearch").getStatus == 200)
    assert(get("/api/search/KeywordSearch").getStatus == 200)
  }

  test("json results are returned when correct accepts header given") {
    val response = get("/api/search/KeywordSearch", "application/json")

    assert(response.getType.toString == "application/json")
    assert(parse(response.getEntity(classOf[String])) \\ "results" == JArray(List()))
  }
}
/**
 * Checks that a list of results survives a JSON round trip.
 */
class EntitiesJsonSerializationTest extends SerializationTest {

  val serializer = new ResultJsonSerializer

  test("a list of result entities should serialize to json correctly") {
    implicit val formats = net.liftweb.json.DefaultFormats

    val twoResults = List(result, result)
    val parsed = Serialization.read[Map[String, List[Result]]](serializer.prettyPrint(twoResults))
    val roundTripped = parsed("results")

    assert(roundTripped.size == 2)
    assert(roundTripped.head == result)
  }
}

/**
 * Checks the element structure of the XML serialization.
 */
class EntitiesXmlSerializationTest extends SerializationTest {

  val serializer = new ResultXmlSerializer

  test("a list of result entities should serialize to xml correctly") {
    val document = serializer.serialize(List(result, result))
    val resultNodes = document \ "Result"

    assert(resultNodes.size == 2)
  }

  test("the result entity should serialize to XML correctly") {
    val node = serializer.serialize(result)

    // scalar elements
    assert((node \ "Label").text == result.label)
    assert((node \ "URI").text == result.uri)
    assert((node \ "Description").text == result.description)
    assert((node \ "Refcount").text == result.refCount.toString)

    // nested elements that carry a URI and a label
    val classNode = node \ "Classes" \ "Class"
    val expectedClass = result.classes.head
    assert((classNode \ "URI").text == expectedClass.uri)
    assert((classNode \ "Label").text == expectedClass.label)

    val categoryNode = node \ "Categories" \ "Category"
    val expectedCategory = result.categories.head
    assert((categoryNode \ "URI").text == expectedCategory.uri)
    assert((categoryNode \ "Label").text == expectedCategory.label)

    // nested elements that only carry a URI
    assert((node \ "Templates" \ "Template" \ "URI").text == result.templates.head.uri)
    assert((node \ "Redirects" \ "Redirect" \ "URI").text == result.redirects.head.uri)
  }
}
/**
 * Checks the labels that the entity classes derive from their URIs.
 */
class EntitiesTest extends FunSuite {

  test("category entity has correct label for uri") {
    val label = new Category("http://dbpedia.org/resource/Category:Berlin").label
    assert(label == "Berlin", "category label incorrect")
  }

  test("class entity has correct label for uri") {
    val label = new OntologyClass("http://dbpedia.org/ontology/City").label
    assert(label == "city", "class label incorrect")
  }

  test("class entity has correct label for owl#Thing") {
    val label = new OntologyClass("http://www.w3.org/2002/07/owl#Thing").label
    assert(label == "owl#Thing", "class label incorrect")
  }

  test("result entity has correct label for uri") {
    val noClasses = Set[OntologyClass]()
    val noCategories = Set[Category]()
    val noTemplates = Set[Template]()
    val noRedirects = Set[Redirect]()

    val berlin = new Result(
      "http://dbpedia.org/resource/Berlin",
      "Some description ...",
      noClasses,
      noCategories,
      noTemplates,
      noRedirects,
      100
    )

    assert(berlin.label == "Berlin", "result label incorrect")
  }
}