diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..dbfe783 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +bin +build +.gradle +.ideaout/ +.idea/ +.settings/ +*.iml +.project +.classpath diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..5eeca62 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT LICENSE + +Copyright (c) 2019 Webis group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100755 index 0000000..2e466b0 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# Netspeak 4 indexing + +This project contains all necessities to create a new Netspeak 4 index. + +This project is mainly intended for developers that want to build a new Netspeak 4 index from a given data set. 
+ + +--- + +## Contributors + +Michael Schmidt (2018 - 2020) + +Martin Trenkmann (2008 - 2013) + +Martin Potthast (2008 - 2020) + +Benno Stein (2008 - 2020) + + + +--- + +# Old Notes + +% NETSPEAK 4 JAVA NOTES +% martin.trenkmann@uni-weimar.de +% November 22, 2013 + + + +Notation +-------- + + # Does need admin permissions (sudo). + $ Does not need admin permissions. + + +Project description +------------------- + + + + +Library dependencies +-------------------- + +This Java project is a language binding for the C++ project netspeak4-application-cpp whose +implementation comes in form of a shared library (.so file). The present Java +application loads the library at runtime and invokes their native routines via +the Java Native Interface (JNI) method. Precompiled libraries for Ubuntu 10.04 +and 12.04 can be found in the lib sub-directory of this project. The native +library itself has some dependencies you need to install as well. To do so run +the following script: + + # /build/install-dependencies.sh + + +Build and install the native library +------------------------------------ + +In the case that there is no precompiled native library available for your +platform, you need to compile the corresponding C++ project by yourself. + +- Checkout netspeak4-application-cpp from webis CVS. +- Build target "Library" with Qt Creator IDE. + +# cp /lib//.so /usr/lib + + +Load native library +------------------- + +Set "-Djava.library.path=/usr/lib" as VM argument. + + +Build Netspeak from n-gram collection +------------------------------------- + +To build Netspeak from a collection of n-grams you have to provide a dedicated +directory with one or more text files as input. Each of these files have to +list a number of n-grams together with their frequencies, one by line. The +format of a single line is defined as follows: + + word_1 SPACE word_2 SPACE ... word_n TAB frequency + +In words: Each line defines an n-gram with its frequency. 
The delimiter between +the n-gram and the frequency is a single tabulator ('\t'). The delimiter to +separate the n-gram's words is a single whitespace (' '). + +Note: Follow this specification strictly to prevent parsing errors. In +particular, ensure the single `\t` delimiter between n-gram and frequency. + + +Getting Started +--------------- + +- `usage.NetspeakBuilderUsage.java` shows how to build Netspeak from a + collection of n-grams. +- `usage.NetspeakTerminal.java` runs a simple command line to search a Netspeak + instance interactively for testing purposes. +- `usage.NetspeakUsage.java` demonstrates how to search Netspeak in more detail + using the Request and Response objects. + +In some cases, if your local hardware, storage space or operating system +(Netspeak runs only on Linux) does not fit, it might be necessary to set up +Netspeak running on a Linux server and to request that instance remotely. + +For that reason build your Netspeak application as usual and run it as a Java +servlet, e.g. with Tomcat, using the project `netspeak4-server`. A running +Netspeak server can then be requested with the `netspeak3-client-java` project from +any Java application. + + +Netspeak query language +----------------------- + +The Netspeak query syntax as described here should be used as reference. There +might be other syntax information out there, e.g. at netspeak.org, which +provides some syntactical simplifications in form of easier to use wildcards or +operators. However, these modified syntaxes are just front-ends and do not work +with the original Netspeak interface. Here is the truth: + + ? is a placeholder for exactly one word and can be sequenced to search for + exactly two, three, four ... words. + + Example: how to ? this + -> how to use this + -> how to do this + -> how to cite this + + * is a placeholder for zero or many words. + + Example: see * works + -> see how it works + -> see if it works + -> see what works + + [] compares options, i.e. 
it checks each word or phrase between these + brackets plus the so-called empty word at that position in the query. + + Example: it's [ great well "so good" ] + -> it's + -> it's great + -> it's well + -> it's so good + + {} checks the order, i.e. it tries to find each permutation of the given + sequence of words or phrases at that position in the query. + + Example: for { "very important people" only } + -> for very important people only + -> for only very important people + + # searches for alternatives of the word following. This operator requests + the optional Netspeak hash-dictionary component and uses [] to compare + each retrieved alternative (except that the empty word is not checked). + The mapping from word to alternatives is completely up to the user when + building Netspeak; for netspeak.org we use this operator for a synonym + search providing the Wordnet dictionary. + + Example: waiting for #response + -> waiting for response + -> waiting for answer + -> waiting for reply + +You can combine the introduced wildcards and operators as you want, but with the +exception that you may not place any wildcard within bracket operators. Also +nested brackets are not allowed. As you can see in the examples above you can +quote phrases to be handled as one entity in `[]` and `{}`. 
+ + + +% Compile via: pandoc from.txt > to.html diff --git a/artifactory.gradle b/artifactory.gradle new file mode 100755 index 0000000..ff96511 --- /dev/null +++ b/artifactory.gradle @@ -0,0 +1,92 @@ +// Fetch Artifactory publishing plugin +buildscript { + repositories { + jcenter() + } + dependencies { + classpath 'org.jfrog.buildinfo:build-info-extractor-gradle:4+' + } +} + +// Apply plugins +apply plugin: 'maven-publish' +apply plugin: org.jfrog.gradle.plugin.artifactory.ArtifactoryPlugin + +// Determine which repositories to pull from and publish to +def pullRelease = 'libs-release' +def pullSnapshot = 'libs-snapshot' +def pushRelease = 'libs-snapshot-webis-gradle' +def pushSnapshot = 'libs-release-webis-gradle' + +if (project.ext.has("nonFree") && project.ext.get("nonFree")) { + pullRelease += '-nonfree' + pullSnapshot += '-nonfree' + pushRelease += '-nonfree' + pushSnapshot += '-nonfree' +} + +repositories { + maven { + url = 'https://repo.webis.de/artifactory/' + pullRelease + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } + maven { + url = 'https://repo.webis.de/artifactory/' + pullSnapshot + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } +} + +// Configure Artifactory remote +artifactory { + contextUrl = "https://repo.webis.de/artifactory" + publish { + repository { + repoKey = version.endsWith('SNAPSHOT') ? 
pushRelease : pushSnapshot + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + maven = true + } + defaults { + publications('mavenJava') + } + } +} + +// Create tasks for generating source and JavaDoc JARs +task sourcesJar(type: Jar, dependsOn: classes) { + classifier = 'sources' + from sourceSets.main.allSource +} + +task javadocJar(type: Jar, dependsOn: javadoc) { + classifier = 'javadoc' + from javadoc.destinationDir +} + +artifacts { + archives javadocJar + archives sourcesJar +} + +// Configure Maven Publishing Information +publishing { + publications { + mavenJava(MavenPublication) { + // Publish binary, source, and JavaDoc JARs + from components.java + artifact sourcesJar + artifact javadocJar + + // Set POM definition + if (project.ext.has("pomDef")) { + pom project.ext.get("pomDef") + } + } + } +} diff --git a/build.gradle b/build.gradle new file mode 100755 index 0000000..339432d --- /dev/null +++ b/build.gradle @@ -0,0 +1,53 @@ +// Apply plugins +apply plugin: 'java' +apply plugin: 'jacoco' +apply plugin: 'application' + +// Basic configuration and settings for all (sub-)projects +allprojects { + group = 'org.netspeak' + version = '1.0' + mainClassName = 'org.netspeak.usage.NetspeakTerminal' + sourceCompatibility = 1.8 + targetCompatibility = 1.8 + + // Set source file encoding + compileJava.options.encoding = "UTF-8" + compileTestJava.options.encoding = "UTF-8" + javadoc.options.encoding = 'UTF-8' + + // Declare global dependencies + dependencies { + compile group: 'org.netspeak', name: 'netspeak4-application-java', version: '1.0' + compile group: 'org.apache.commons', name: 'commons-compress', version: '1.19' + + testImplementation 'junit:junit:4.12' + } + + // Set MANIFEST.MF contents + jar { + manifest { + attributes('Main-Class': mainClassName) + } + } +} + +// Set POM definition +project.ext.pomDef = { + name = 'Netspeak 4 stuff' + description = 'An application with lots of 
miscellaneous functionality related to Netspeak 4' + url = 'http://netspeak.org' + //licenses { + // license { + // name = 'The Apache License, Version 2.0' + // url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + // } + //} + organization { + name = 'Netspeak' + url = 'http://netspeak.org' + } +} + +// Include Artifactory configuration +apply from: 'artifactory.gradle' diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100755 index 0000000..0d4a951 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100755 index 0000000..37e7bf4 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Thu Aug 02 10:12:04 CEST 2018 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-all.zip diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..cccdd3d --- /dev/null +++ b/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 
+ esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100755 index 0000000..f955316 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/src/main/java/org/netspeak/Util.java b/src/main/java/org/netspeak/Util.java new file mode 100755 index 0000000..5b01a05 --- /dev/null +++ b/src/main/java/org/netspeak/Util.java @@ -0,0 +1,238 @@ +package org.netspeak; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.LinkOption; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import 
java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.regex.MatchResult; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static java.util.Objects.requireNonNull; + +public final class Util { + + private Util() { + } + + /** + * Deletes the given file or directory. + *

+ * System links will not be followed. This will throw for non-empty directories. This operation will do nothing if + * the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile) throws IOException { + delete(dirOrFile, false); + } + + /** + * Deletes the given file or directory (recursively). + *

+ * System links will not be followed. This will throw for non-empty directories if not recursive. This operation + * will do nothing if the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile, boolean recursive) throws IOException { + if (!recursive) { + Files.deleteIfExists(dirOrFile); + } else { + if (!Files.exists(dirOrFile, LinkOption.NOFOLLOW_LINKS)) { + return; + } + Files.walkFileTree(dirOrFile, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + } + + public static void createEmptyDirectory(Path dir) throws IOException { + requireNonNull(dir); + if (Files.isDirectory(dir)) { + if (Files.newDirectoryStream(dir).iterator().hasNext()) { + throw new AssertionError("Is not empty " + dir); + } + } else { + Files.createDirectories(dir); + } + } + + public static List getAll(Iterable> futures) throws InterruptedException, ExecutionException { + List values = new ArrayList<>(); + for (Future f : futures) { + values.add(f.get()); + } + return values; + } + + /** + * Returns the + * + * @param path + * @return + * @throws IOException + */ + public static Set readWordList(Path path) throws IOException { + try (FileInputStream fileIn = new FileInputStream(path.toFile()); + Reader in = new InputStreamReader(fileIn, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static Set readWordList(Reader in) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(in)) { + Set set = new LinkedHashSet<>(); + + bufferedReader.lines().forEach(word -> { + if (word == null || word.isEmpty()) + return; + + word = word.trim(); + if (!word.isEmpty()) { + 
set.add(word); + } + }); + + return set; + } + } + + public static Set readWordList(String path) throws IOException { + return readWordList(Paths.get(path)); + } + + public static Set readResourceWordList(String name) throws IOException { + try (InputStream input = Util.class.getResourceAsStream(name); + Reader in = new InputStreamReader(input, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static String toPhrase(String[] words) { + if (words.length == 1) + return words[0]; + + StringBuilder sb = new StringBuilder(); + sb.append(words[0]); + + for (int i = 1; i < words.length; i++) { + sb.append(' '); + sb.append(words[i]); + } + + return sb.toString(); + } + + + /** + * Replaces all occurrences of the given pattern in the given string with the string returned by the replacer + * function. + * + * @param pattern + * @param string + * @param replacer + * @return + */ + public static String replaceAll(Pattern pattern, String string, Function replacer) { + Matcher matcher = pattern.matcher(string); + + requireNonNull(replacer); + boolean result = matcher.find(); + if (result) { + StringBuilder sb = new StringBuilder(); + int last; + do { + String replacement = replacer.apply(matcher); + sb.append(replacement); + last = matcher.end(); + result = matcher.find(); + } while (result); + sb.append(string, last, string.length()); + return sb.toString(); + } + return string; + } + + + public interface ThrowsRunnable extends Runnable { + + void runThrowing() throws Exception; + + @Override + default void run() { + try { + runThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public interface ThrowsConsumer extends Consumer { + + void acceptThrowing(T t) throws Exception; + + @Override + default void accept(T t) { + try { + acceptThrowing(t); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public 
interface ThrowsSupplier extends Supplier { + + T getThrowing() throws Exception; + + @Override + default T get() { + try { + return getThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + +} diff --git a/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java new file mode 100644 index 0000000..1c15dee --- /dev/null +++ b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java @@ -0,0 +1,130 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for Google books CSV files. + *

+ * These files are a bit difficult to parse because the n-grams are also + * separated by years. So there will be many consecutive occurrences of the same + * phrase but for different years. This reader will automatically parse and + * aggregate these entries. + *

+ * Example: + * + *

+ * collision such    2000    4     4     4
+ * collision such    2001    6     6     6
+ * collision such    2002    6     6     6
+ * collision such    2003    10    11    0
+ * collision such    2004    17    11    5
+ * collision such    2005    14    11    3
+ * collision such    2006    20    22    0
+ * collision such    2007    17    11    7
+ * collision such    2008    19    11    8
+ * 
+ * + * These will all be parsed and aggregated into: + * + *
+ * 
+ * String phrase  = "collision such";
+ * long frequency = 113;
+ * 
+ * 
+ * + * @author Michael Schmidt + * + */ +public class GoogleBooksCsvReader implements PhraseReader { + + private final BufferedReader reader; + private String lastLine = null; + + public GoogleBooksCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = lastLine == null ? reader.readLine() : lastLine; + + MutablePhraseFrequencyPair pair = new MutablePhraseFrequencyPair(null, -1); + while (line != null && !parseLine(line, pair)) { + // we read lines until we find one which parses or arrive at the end + line = reader.readLine(); + } + if (line == null || pair.phrase == null) + return null; + + // aggregate the frequencies of the next lines which also have the current + // phrase + String currentPhrase = pair.phrase; + long currentFrequency = pair.frequency; + + String nextLine; + while ((nextLine = reader.readLine()) != null) { + if (parseLine(nextLine, pair)) { + if (currentPhrase.contentEquals(pair.phrase)) { + currentFrequency += pair.frequency; + } else { + break; + } + } + } + lastLine = nextLine; + + return new PhraseFrequencyPair(currentPhrase, currentFrequency); + } + + /** + * This parses a CSV line. + *

+ * Returns {@code false} if the given line could not be parsed. + */ + private static boolean parseLine(String line, MutablePhraseFrequencyPair pair) { + // e.g. "circumvallate\t1978\t313\t215\t85" + // "The first line tells us that in 1978, the word "circumvallate" occurred 313 + // times overall, on 215 distinct pages and in 85 distinct books." + + // this operation will be done millions of times, so I want to avoid + // String#split + + int firstTab = line.indexOf('\t', 0); + int secondTab = line.indexOf('\t', firstTab + 1); + int thirdTab = line.indexOf('\t', secondTab + 1); + if (firstTab == -1 || secondTab == -1 || thirdTab == -1) + return false; + + // phrases sometimes have a trailing space, so we have to remove that + String phrase = line.substring(0, firstTab).trim(); + // the empty string is not a valid phrase + if (phrase.isEmpty()) { + return false; + } + + pair.phrase = phrase; + pair.frequency = Long.parseLong(line.substring(secondTab + 1, thirdTab)); + + return true; + } + + @Override + public void close() throws IOException { + reader.close(); + } + + private static class MutablePhraseFrequencyPair { + + public String phrase; + public long frequency; + + public MutablePhraseFrequencyPair(final String phrase, final long frequency) { + this.phrase = phrase; + this.frequency = frequency; + } + + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseFrequencyPair.java b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java new file mode 100644 index 0000000..a44ec25 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java @@ -0,0 +1,40 @@ +package org.netspeak.io; + +import static java.util.Objects.requireNonNull; + +public class PhraseFrequencyPair { + + public final String phrase; + public final long frequency; + + /** + * Creates a new phrase frequency pair. + * + * @param phrase + * @param frequency + * @throws NullPointerException if the given phrase is {@code null}. 
+ * @throws IllegalArgumentException if the given frequency is {@code <= 0}. + */ + public PhraseFrequencyPair(final String phrase, final long frequency) { + if (frequency <= 0) { + throw new IllegalArgumentException(); + } + this.phrase = requireNonNull(phrase); + this.frequency = frequency; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof PhraseFrequencyPair) { + PhraseFrequencyPair other = (PhraseFrequencyPair) obj; + return this.phrase.contentEquals(other.phrase) && this.frequency == other.frequency; + } + return false; + } + + @Override + public int hashCode() { + return phrase.hashCode() ^ (int) frequency ^ (int) (frequency >>> 32); + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseReader.java b/src/main/java/org/netspeak/io/PhraseReader.java new file mode 100644 index 0000000..ed8d31a --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseReader.java @@ -0,0 +1,22 @@ +package org.netspeak.io; + +/** + * A interface for readers which return one phrase-frequency-pair at a time. + * + * @see GoogleBooksCsvReader + * @see SimpleCsvReader + * + * @author Michael + */ +public interface PhraseReader extends AutoCloseable { + + /** + * Returns the next phrase-frequency-pair or {@code null} if no other pairs will + * be returned. + * + * @return + * @throws Exception + */ + PhraseFrequencyPair nextPair() throws Exception; + +} diff --git a/src/main/java/org/netspeak/io/PhraseWriter.java b/src/main/java/org/netspeak/io/PhraseWriter.java new file mode 100644 index 0000000..9052ee1 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseWriter.java @@ -0,0 +1,24 @@ +package org.netspeak.io; + +public interface PhraseWriter extends AutoCloseable { + + /** + * Writes the given phrase and frequency. + * + * @param phrase + * @param frequency + * @throws Exception + */ + void write(String phrase, long frequency) throws Exception; + + /** + * Writes the given phrase-frequency-pair. 
+ * + * @param pair + * @throws Exception + */ + default void write(PhraseFrequencyPair pair) throws Exception { + this.write(pair.phrase, pair.frequency); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvReader.java b/src/main/java/org/netspeak/io/SimpleCsvReader.java new file mode 100644 index 0000000..a960cfe --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvReader.java @@ -0,0 +1,61 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for simple CSV files. + *

+ * In these CSV files, every line ({@code \n}) contains a phrase followed by a + * single tab ({@code \t}) followed by the frequency of that phrase. There may + * be duplicate phrases. A phrase is a non-empty list of words each separated by + * a single whitespace ({@code \u0020}) with no leading or trailing spaces. + * + *

+ * hello world	20
+ * i love you	100
+ * hello world	5
+ * 
+ * + * @author Michael Schmidt + * + */ +public class SimpleCsvReader implements PhraseReader { + + private final BufferedReader reader; + + public SimpleCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = reader.readLine(); + + if (line != null) { + // For better performance, we avoid String#split. Instead we know that a line + // only contains one \t, so we search for that index. To validate the format, we + // also search for a second \t. This is equivalent to: + // String[] parts = line.split("\t"); + // if (parts.length == 2) { create the pair } else { null } + int firstTab = line.indexOf('\t'); + int secondTab = line.indexOf('\t', firstTab + 1); + + // The first tab has to exist and it cannot be 0 because the phrase cannot be + // the empty string. The second tab has to not exist. + if (firstTab > 0 && secondTab == -1) { + String phrase = line.substring(0, firstTab); + long frequency = Long.parseLong(line.substring(firstTab + 1)); + return new PhraseFrequencyPair(phrase, frequency); + } + } + + return null; + } + + @Override + public void close() throws IOException { + reader.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvWriter.java b/src/main/java/org/netspeak/io/SimpleCsvWriter.java new file mode 100755 index 0000000..e92e968 --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvWriter.java @@ -0,0 +1,31 @@ +package org.netspeak.io; + +import java.io.BufferedWriter; +import java.io.IOException; + +/** + * A writer for CSV files which can be understood by the Netspeak index builder. + *

+ * For more details on the format see {@link SimpleCsvReader}. + * + * @author Michael Schmidt + */ +public class SimpleCsvWriter implements PhraseWriter { + + private final BufferedWriter writer; + + public SimpleCsvWriter(BufferedWriter writer) { + this.writer = writer; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + writer.append(phrase).append('\t').append(Long.toString(frequency)).append('\n'); + } + + @Override + public void close() throws IOException { + writer.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SplitterCsvWriter.java b/src/main/java/org/netspeak/io/SplitterCsvWriter.java new file mode 100644 index 0000000..37cfa0f --- /dev/null +++ b/src/main/java/org/netspeak/io/SplitterCsvWriter.java @@ -0,0 +1,85 @@ +package org.netspeak.io; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardOpenOption.CREATE_NEW; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.CharsetEncoder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * This CSV writer will create a given number of CSV files which will be used as + * buckets where phrases will be assigned a bucket according to their hash. + * These bags can then be used for further processing. + *

+ * The {@link #write(String, long)} and {@link #write(PhraseFrequencyPair)} + * methods are thread-safe. + * + * @author Michael + * + */ +public class SplitterCsvWriter implements PhraseWriter { + + private final SimpleCsvWriter[] writers; + private final Path destDir; + private boolean initialized = false; + + public SplitterCsvWriter(Path destDir, int bucketCount) { + this.writers = new SimpleCsvWriter[bucketCount]; + this.destDir = destDir; + } + + @Override + public void close() throws Exception { + Exception last = null; + + for (SimpleCsvWriter writer : writers) { + try { + if (writer != null) + writer.close(); + } catch (Exception e) { + last = e; + } + } + + if (last != null) + throw last; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + initializeWriters(); + + int index = phrase.hashCode() % writers.length; + if (index < 0) + index += writers.length; + SimpleCsvWriter writer = writers[index]; + synchronized (writer) { + writer.write(phrase, frequency); + } + } + + private final void initializeWriters() throws IOException { + if (initialized) + return; + synchronized (this) { + if (initialized) + return; + + for (int i = 0; i < writers.length; i++) { + Path path = Paths.get(destDir.toString(), String.valueOf(i) + ".csv"); + CharsetEncoder encoder = UTF_8.newEncoder(); + Writer writer = new OutputStreamWriter(Files.newOutputStream(path, CREATE_NEW), encoder); + writers[i] = new SimpleCsvWriter(new BufferedWriter(writer, 1024 * 256)); + } + + initialized = true; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/ContractionMapper.java b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java new file mode 100755 index 0000000..b3cfdeb --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java @@ -0,0 +1,246 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; 
+import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Consumer; +import java.util.regex.Pattern; + +import static java.util.Locale.ENGLISH; + +public class ContractionMapper implements PhraseMapper { + + private final Pattern contractionPattern; + private final Pattern incompleteContractionPattern; + private final Map knownContractionMap = new HashMap<>(); + private static final Pattern POSSESSIVE_S_PATTERN = Pattern.compile("s '(?= |\\z)", Pattern.CASE_INSENSITIVE); + + public ContractionMapper(Path file) throws IOException { + this(Util.readWordList(file)); + } + + public ContractionMapper(Iterable knownContractions) { + StringBuilder pattern = new StringBuilder(); + Set incompleteContractionSuffixes = new HashSet<>(); + Set incompleteContractionPrefixes = new HashSet<>(); + + for (String known : knownContractions) { + // add know contractions ending with "n't" without ' to the map + // we can't do this for contractions like "we'll" or "i'm" because of the false + // positives + if (known.endsWith("n't")) { + for (String contraction : allCombinations(known)) { + int index = contraction.indexOf('\''); + knownContractionMap.put(contraction.replace("'", "").toLowerCase(ENGLISH), index); + } + } + + // add prefixes and suffixes to lists + int apo = known.indexOf('\''); + incompleteContractionPrefixes.addAll(allCombinations(known.substring(0, apo))); + incompleteContractionSuffixes.addAll(allCombinations(known.substring(apo + 1))); + + // make it all non-capturing for better performance + known = known.replace("\\((?!\\?)", "(?:"); + // replace the ' with all variations + known = known.replace("'", "(?: '|' | ' | )"); + + pattern.append(known); + pattern.append('|'); + } + pattern.append("[^\\s\\S]"); + + // contractionPattern + String finalPattern = "(?<= |\\A)(?:" + pattern.toString() + ")(?= |\\z)"; + // n't can be fixed with minimal context as it's the only contraction 
with both + // prefix and suffix + finalPattern += "|n(?: '|' | ' )t(?= |\\z)"; + // join possessive S + finalPattern += "|(?: '|' | ' )s(?= |\\z)"; + + contractionPattern = Pattern.compile(finalPattern, Pattern.CASE_INSENSITIVE); + + // incompleteContractionPattern + incompleteContractionPrefixes.remove(""); + incompleteContractionSuffixes.remove(""); + + StringBuilder incompletePattern = new StringBuilder(); + incompletePattern.append("(?:\\A| )(?:"); + incompletePattern.append(String.join("|", incompleteContractionPrefixes)); + incompletePattern.append(")(?: ?'\\z)"); + incompletePattern.append("|"); + incompletePattern.append("(?:\\A' ?)(?:"); + incompletePattern.append(String.join("|", incompleteContractionSuffixes)); + incompletePattern.append(")(?: |\\z)"); + + incompleteContractionPattern = Pattern.compile(incompletePattern.toString(), Pattern.CASE_INSENSITIVE); + } + + + @Override + public String map(String phrase, long frequency) { + // phrases with incomplete contractions will be removed + if (incompleteContractionPattern.matcher(phrase).find()) { + return null; + } + + + phrase = Util.replaceAll(contractionPattern, phrase, match -> { + String m = match.group(); + if (m.indexOf('\'') == -1) { + // e.g. "don t" + return m.replace(' ', '\''); + } else { + // e.g. "don' t" or "don 't" or "don ' t" + return m.replace(" ", ""); + } + }); + + String[] words = phrase.split(" "); + boolean changed = false; + for (int i = 0; i < words.length; i++) { + String word = words[i]; + String lowercase = word.toLowerCase(ENGLISH); + Integer ind = knownContractionMap.get(lowercase); + if (ind != null) { + int index = ind; + words[i] = word.substring(0, index) + '\'' + word.substring(index); + changed = true; + } + } + + phrase = changed ? 
Util.toPhrase(words) : phrase; + + phrase = POSSESSIVE_S_PATTERN.matcher(phrase).replaceAll("s'"); + + return phrase; + } + + private static List allCombinations(String pattern) { + List alternatives = new ArrayList<>(); + parseAlternation(pattern, 0, alternatives::add); + + List words = new ArrayList<>(); + for (Concatenation concat : alternatives) { + List builders = new ArrayList<>(); + builders.add(new StringBuilder()); + addCombinations(builders, concat); + builders.forEach(b -> words.add(b.toString())); + } + + return words; + } + + private static void addCombinations(List builders, Concatenation concat) { + for (Element e : concat.getElements()) { + if (e instanceof Literal) { + String value = ((Literal) e).toString(); + builders.forEach(b -> b.append(value)); + } else { + List alternatives = ((Alternation) e).getConcatenations(); + List original = new ArrayList<>(builders); + builders.clear(); + for (Concatenation alternative : alternatives) { + List newBuilders = new ArrayList<>(); + original.forEach(b -> newBuilders.add(new StringBuilder(b))); + addCombinations(newBuilders, alternative); + builders.addAll(newBuilders); + } + } + } + } + + private static int parseAlternation(String pattern, final int startIndex, Consumer consumeConcat) { + int index = startIndex; + + List concat = new ArrayList<>(); + + while (index < pattern.length()) { + char c = pattern.charAt(index++); + if (c == ')') + break; + if (c == '(') { + List alternatives = new ArrayList<>(); + index += parseAlternation(pattern, index, alternatives::add); + if (alternatives.size() == 1) { + concat.addAll(alternatives.get(0).getElements()); + } else { + concat.add(new Alternation(alternatives)); + } + } else if (c == '|') { + consumeConcat.accept(new Concatenation(concat)); + concat = new ArrayList<>(); + } else { + boolean added = false; + if (!concat.isEmpty()) { + Element last = concat.get(concat.size() - 1); + if (last instanceof Literal) { + ((Literal) last).append(c); + added = true; + } 
+ } + if (!added) { + concat.add(new Literal(c)); + } + } + } + + consumeConcat.accept(new Concatenation(concat)); + + return index - startIndex; + } + + private interface Element { + } + + private static class Concatenation { + private final List elements; + + public Concatenation(List elements) { + this.elements = elements; + } + + public List getElements() { + return elements; + } + + } + + private static class Literal implements Element { + private String value; + + public Literal(char value) { + this.value = Character.toString(value); + } + + public void append(char c) { + this.value += c; + } + + @Override + public String toString() { + return value; + } + + } + + private static class Alternation implements Element { + private final List concatenations; + + public Alternation(List concatenations) { + this.concatenations = concatenations; + } + + public List getConcatenations() { + return concatenations; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java new file mode 100755 index 0000000..e27a2f9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java @@ -0,0 +1,337 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsConsumer; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +/** + * This will join all hyphenated words in two phases. + *

+ * In the first phase it will iterate over all input phrases and extract the vocabulary and stop words. This will be done + * according to the given options. + *

+ * The second phase is specific to the joiner set. Generally, it will try to normalize and, where possible, join + * hyphenated words. + * + * @see German + */ +public class HyphenationJoiner implements PipelineItem { + + private Path logFile; + private ThrowsConsumer vocabularyConsumer; + + private final Joiner joiner; + private final Path output; + private final PreprocessingOptions options; + + public HyphenationJoiner(JoinerFactory joinerFactory, Path output, PreprocessingOptions options) throws Exception { + this.joiner = requireNonNull(joinerFactory).create(); + this.output = requireNonNull(output); + this.options = requireNonNull(options); + } + + /** + * Sets an optional log file. + *

+ * Every action the method takes will be logged here in order, one operation per line. The result and the + * reason for not joining will be logged in the following format: + * + *

+	 * {action}:[ {result}:] {phrase}
+	 * 
+ *

+ * The actions are and results are implementation defined and might be different for each language specific joiner. + * See the implementation of the current joiner for more information. + */ + public void setLogger(Path logFile) { + this.logFile = logFile; + } + + public void setVocabularyConsumer(ThrowsConsumer vocabularyConsumer) { + this.vocabularyConsumer = vocabularyConsumer; + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + // Pass 1 + + if (joiner.getRequiresVocabulary()) { + System.out.println("Extracting vocabulary..."); + + VocabularyExtractor vocabExtractor = new VocabularyExtractor(); + Preprocessing.iterate(source, Arrays.asList(vocabExtractor), options); + + System.out.println("Preparing vocabulary..."); + if (vocabularyConsumer != null) { + vocabularyConsumer.accept(vocabExtractor); + } + + Set vocabulary = vocabExtractor.getVocabulary(); + vocabExtractor = null; + System.gc(); + + joiner.setVocabulary(vocabulary); + } + + System.out.println("Joining Hyphenations..."); + options.setMergeDuplicates(true); // this operation is going to create duplicates + + // We use Java 8, so we have to give it a charset name, so it can lookup the charset instance. In never version + // you can give it an instance directly. + String charsetName = StandardCharsets.UTF_8.name(); + final PrintStream logger = logFile == null ? 
null : new PrintStream(logFile.toFile(), charsetName); + try { + joiner.setLogger(logger); + + return Preprocessing.process(source, output, Arrays.asList(joiner), options); + } finally { + if (logger != null) { + logger.close(); + } + } + } + + public interface Joiner extends PhraseMapper { + boolean getRequiresVocabulary(); + + void setVocabulary(Set vocabulary); + + void setLogger(PrintStream logger); + } + + public interface JoinerFactory { + Joiner create() throws Exception; + } + + public static class German implements JoinerFactory { + /** + * The top k words from the vocabulary will be treated as stop words. This will set the k. + */ + int stopWordsTopK = 100; + /** + * An optional stop word list. + *

+ * This list will be merged with the top k stop words from the vocabulary. + */ + Collection stopWordList = null; + + public void setStopWordsTopK(int stopWordsTopK) { + this.stopWordsTopK = stopWordsTopK; + } + + public void setStopWordList(Path stopWordList) throws IOException { + this.stopWordList = Util.readWordList(stopWordList); + } + + public void setStopWordList(Collection stopWordList) { + this.stopWordList = stopWordList; + } + + @Override + public Joiner create() throws Exception { + return new GermanJoiner(this); + } + } + + private static class GermanJoiner implements Joiner { + + private final German options; + + private Set vocabulary; + private Set stopWords = new HashSet<>(); + + private PrintStream logger; + + public GermanJoiner(German options) throws IOException { + this.options = requireNonNull(options); + if (options.stopWordList != null) + stopWords.addAll(options.stopWordList); + } + + @Override + public void setVocabulary(Set vocabulary) { + this.vocabulary = vocabulary; + HyphenationJoiner.addTopK(stopWords, vocabulary, options.stopWordsTopK); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return true; + } + + private String[] normalizeHyphens(String[] words, String phrase) { + if (words.length < 2) + return words; + + int toRemove = 0; + for (int i = 1; i < words.length; i++) { + if ("-".contentEquals(words[i])) + toRemove++; + } + + if (toRemove == 0) + return words; + + String[] newWords = new String[words.length - toRemove]; + newWords[0] = words[0]; + int writeIndex = 1; + for (int i = 1; i < words.length; i++) { + String word = words[i]; + if ("-".contentEquals(words[i])) { + newWords[writeIndex - 1] = newWords[writeIndex - 1] + "-"; + } else { + newWords[writeIndex++] = word; + } + } + + if (logger != null) { + logger.println("Normalize: " + Util.toPhrase(newWords) + ": " + phrase); + } + + return newWords; + } + + private 
String[] joinHyphen(String[] words, String phrase) { + /** + * For all pairs matching the pattern `{words1}- {words2}`, we want to transform + * them to either `{words1}{words2}`, `{words1}-{words2}`, or leave them as is. + */ + + for (int i = 0; i < words.length - 1; i++) { + String word = words[i]; + String next = words[i + 1]; + if (word.length() > 1 && word.charAt(word.length() - 1) == '-') { + + // if the next word is a stop word, we leave it as is. + if (stopWords.contains(next)) { + if (logger != null) { + logger.println("Stop word: " + next + ": " + phrase); + } + continue; + } + + String result = null; + + /** + * To do the word join {word1}{word2}, 3 criteria have to be met: + * + * 1. {word2} can't be a stop word.
+ * 2. {word2} has to begin with a lower case letter.
+ * 3. The concatenation {word1}{word2} has to be a known word. + */ + + if (Character.isLowerCase(next.charAt(0))) { + String concat = word.substring(0, word.length() - 1) + next; + if (vocabulary.contains(concat)) { + result = concat; + if (logger != null) { + logger.println("Full join: " + concat + ": " + phrase); + } + } + } + + words[i] = null; + words[i + 1] = result == null ? word + next : result; + } + } + + return HyphenationJoiner.removeNull(words); + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf('-') == -1) + return phrase; + + String[] words = normalizeHyphens(phrase.split(" "), phrase); + + words = joinHyphen(words, phrase); + + return Util.toPhrase(words); + } + + } + + public static class English implements JoinerFactory { + @Override + public Joiner create() throws Exception { + return new EnglishJoiner(); + } + } + + private static class EnglishJoiner implements Joiner { + + private PrintStream logger; + + @Override + public void setVocabulary(Set vocabulary) { + throw new UnsupportedOperationException(); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return false; + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf(" - ") == -1) + return phrase; + + String newPhrase = phrase.replace(" - ", "-"); + if (logger != null) { + logger.println("Join: " + newPhrase + ": " + phrase); + } + + return newPhrase; + } + + } + + private static String[] removeNull(String[] words) { + int nullEntries = 0; + for (String word : words) + if (word == null) + nullEntries++; + + if (nullEntries == 0) + return words; + + String[] newWords = new String[words.length - nullEntries]; + int writeIndex = 0; + for (String w : words) { + if (w != null) + newWords[writeIndex++] = w; + } + return newWords; + } + + private static void addTopK(Collection consumer, Collection supplier, int k) { + for 
(T item : supplier) { + if (k-- <= 0) + break; + consumer.add(item); + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Operations.java b/src/main/java/org/netspeak/preprocessing/Operations.java new file mode 100644 index 0000000..1d71602 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Operations.java @@ -0,0 +1,198 @@ +package org.netspeak.preprocessing; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.netspeak.Util; + +public abstract class Operations { + + private Operations() { + } + + public static PipelineItem standardOperations(Path output, StandardOperationsOptions operationOptions, + PreprocessingOptions options) { + return source -> { + List mappers = new ArrayList<>(); + + // try to remove as much junk as possible + // In this phase, phrases will only be removed and not altered. + mappers.add(PhraseMappers.removeControlCharacters()); + if (operationOptions.superBlacklist != null) { + mappers.add(PhraseMappers.superBlacklist(operationOptions.superBlacklist)); + } + mappers.add(PhraseMappers.removeGoogleWebMarkers()); + mappers.add(PhraseMappers.removeHTMLEntities()); + mappers.add(PhraseMappers.removeURLsAndEmails()); + mappers.add(PhraseMappers.removeFileNames()); + + // Normalization phase + mappers.add(PhraseMappers.explodeCommas()); + mappers.add(PhraseMappers.removeLeadingDoubleQuote()); + mappers.add(PhraseMappers.joinWordsWithLeadingApostrophe()); + + if (operationOptions.blacklist != null) { + mappers.add( + PhraseMappers.blacklist(operationOptions.blacklist, operationOptions.blacklistCombinations)); + } + if (operationOptions.maxNGram < Integer.MAX_VALUE) { + mappers.add(PhraseMappers.maxNGram(operationOptions.maxNGram)); + } + if (operationOptions.toLowerCase) { + mappers.add(PhraseMappers.toLowerCase()); + } + + if (operationOptions.additionalMappers != 
null) { + mappers.addAll(operationOptions.additionalMappers); + } + + // the above operations are going to produce duplicates + options.setMergeDuplicates(true); + + return Preprocessing.process(source, output, mappers, options); + }; + } + + public static class StandardOperationsOptions { + /** + * The maximum allowed number of words per phrase. + */ + int maxNGram = Integer.MAX_VALUE; + /** + * Whether all phrases should be lower-cased. + */ + boolean toLowerCase = false; + /** + * All phrases with at least one word which can be constructed from at most + * {@link #blacklistCombinations} many blacklisted word will be removed. + */ + Collection blacklist = null; + int blacklistCombinations = 4; + /** + * @see PhraseMappers#superBlacklist(Iterable) + */ + Collection superBlacklist = null; + /** + * Additional mappers which will be executed after the mappers defined by the + * method. + */ + List additionalMappers = new ArrayList<>(); + + public void setBlacklist(Path blacklist) throws IOException { + this.blacklist = Util.readWordList(blacklist); + } + + public void setBlacklist(Collection blacklist) { + this.blacklist = blacklist; + } + + public void setBlacklistCombinations(int blacklistCombinations) { + this.blacklistCombinations = blacklistCombinations; + } + + public void setSuperBlacklist(Path superBlacklist) throws IOException { + this.superBlacklist = Util.readWordList(superBlacklist); + } + + public void setSuperBlacklist(Collection superBlacklist) { + this.superBlacklist = superBlacklist; + } + + public List getAdditionalMappers() { + return additionalMappers; + } + + public void setToLowerCase(boolean toLowerCase) { + this.toLowerCase = toLowerCase; + } + + public void setMaxNGram(int maxNGram) { + this.maxNGram = maxNGram; + } + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. 
+ * @return + */ + public static PipelineItem moveTo(Path output) { + return source -> { + Path dest = output.toAbsolutePath(); + System.out.println("Moving to " + dest); + System.out.println("From:"); + System.out.println(source); + + Util.createEmptyDirectory(dest); + List newSources = new ArrayList<>(); + moveTo(newSources, source, dest); + + System.out.println("Done."); + + if (newSources.size() == 1) { + return newSources.get(0); + } else { + return PhraseSource.combine(newSources); + } + }; + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. + * @return + */ + public static PipelineItem moveTo(String output) { + return moveTo(Paths.get(output)); + } + + private static void moveTo(List out, PhraseSource source, Path dest) throws Exception { + if (source instanceof PhraseSource.Combined) { + for (PhraseSource s : ((PhraseSource.Combined) source).getSources()) { + moveTo(out, s, dest); + } + } else if (source instanceof SimplePhraseSource) { + SimplePhraseSource simple = (SimplePhraseSource) source; + // actually move some files + for (PhraseSource.File file : simple.getFiles()) { + Files.move(file.getPath(), dest.resolve(file.getPath().getFileName())); + } + + SimplePhraseSource newSource = new SimplePhraseSource(dest); + newSource.setReaderFactory(simple.readerFactory); + out.add(newSource); + } else { + throw new UnsupportedOperationException( + "Cannot move files of unknown source class " + source.getClass().getName()); + } + } + + /** + * Deletes all files of the input phrase source. + *

+ * The item will return {@link PhraseSource#EMPTY}. + * + * @return + */ + public static PipelineItem delete() { + return source -> { + System.out.println("Deleting:"); + System.out.println(source); + + for (PhraseSource.File file : source.getFiles()) { + Files.delete(file.getPath()); + } + + System.out.println("Done."); + return PhraseSource.EMPTY; + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMapper.java b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java new file mode 100755 index 0000000..908bdc7 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java @@ -0,0 +1,77 @@ +package org.netspeak.preprocessing; + +/** + * An interface providing a {@link #map(String, long)} function that transforms + * a given phrase. This interface can be used to apply certain string operations + * on phrases, such as case conversion or removal. Filter instances can be + * organized in some sort of collection to be applied one by one on the same + * phrase. + */ +@FunctionalInterface +public interface PhraseMapper { + + /** + * Maps a given input {@code phrase} to some output phrase. The returned phrase + * may be {@code null} or the empty string in which case the phrase will be + * removed from the corpus. + *

+ * The returned phrase is not allowed to contain tabs, line breaks, + * adjacent spaces, and leading or trailing spaces. + * + * @param phrase The input phrase string. This is guaranteed to not be + * {@code null} and to not be the empty string. + * @param frequency The phrase frequency. + * @return The filtered phrase string. + */ + String map(String phrase, long frequency); + + /** + * The name of the PhraseMapper. + *

+ * This name can be useful for diagnostics and will be used by + * {@link Preprocessing} when printing information about a {@link PhraseMapper}. + * By default this will be the name of the class of the mapper. + * + * @return + */ + default String getName() { + return getClass().getName(); + } + + /** + * Returns a new {@link PhraseMapper} which behaves like the given + * {@link PhraseMapper} and with the name of the full name of the caller method. + * + * @param mapper + * @return + */ + static PhraseMapper rename(PhraseMapper mapper) { + StackTraceElement[] stack = Thread.currentThread().getStackTrace(); + StackTraceElement caller = stack[2]; + return rename(caller.getClassName() + "." + caller.getMethodName(), mapper); + } + + /** + * Returns a new {@link PhraseMapper} with the given name which behaves like the + * given {@link PhraseMapper}. + * + * @param name + * @param mapper + * @return + */ + static PhraseMapper rename(String name, PhraseMapper mapper) { + return new PhraseMapper() { + + @Override + public String map(String phrase, long frequency) { + return mapper.map(phrase, frequency); + } + + @Override + public String getName() { + return name; + } + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMappers.java b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java new file mode 100755 index 0000000..74557e9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java @@ -0,0 +1,436 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +/** + * Some common {@link PhraseMapper} to be used in {@link Preprocessing}. + */ +public final class PhraseMappers { + + /** + * Returns a new {@link PhraseMapper} which converts phrases to lower case. + *

+ * Example: "You'll make it" becomes "you'll make it" + *

+ * + * @return + */ + public static PhraseMapper toLowerCase() { + return PhraseMapper.rename((phrase, frequency) -> phrase.toLowerCase()); + } + + /** + * Returns a new {@link PhraseMapper} which removes one leading double quote + * from a word. + *

+ * Example: "fo"o ""bar" will become fo"o "bar" and + * " foo will stay " foo + *

+ * + * @return + */ + public static PhraseMapper removeLeadingDoubleQuote() { + return PhraseMapper.rename((phrase, frequency) -> LEADING_DOUBLE_QUOTE_PATTERN.matcher(phrase).replaceAll("")); + } + + private static final Pattern LEADING_DOUBLE_QUOTE_PATTERN = Pattern + .compile("(?:(?!\\G)|\\A)(?:\\A|(?<= ))\"(?=[^ ])"); + + /** + * Returns a new {@link PhraseMapper} which joins two consecutive words within + * the phrase if the second word starts with an apostrophe. + *

+ * Example: "You 'll make it" will become + * "You'll make it" and "don 't" will become + * "don't" + *

+ * + * @return + */ + public static PhraseMapper joinWordsWithLeadingApostrophe() { + return PhraseMapper.rename((phrase, frequency) -> phrase.replace(" '", "'")); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words) { + return PhraseMapper.rename(blacklist(words, 1)); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is an + * HTML + * entity. + * + * @return + */ + public static PhraseMapper removeHTMLEntities() { + return PhraseMapper.rename(filterByWords(w -> !(w.charAt(0) == '&' && w.charAt(w.length() - 1) == ';'))); + } + + /** + * Removes all control characters. + * + * See + * here + * for more details. + * + * @return + */ + public static PhraseMapper removeControlCharacters() { + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + char c = phrase.charAt(i); + if (c < ' ') // \x00 - \x1F + return null; + if (0x7F <= c && c <= 0x9F) // DEL, \x80 - \x9F + return null; + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + *

+ * Phrases which contains a word which can be constructed by concatenating + * {@code <= repeating} many words from the blacklist will also be removed. I.e. + * if {@code "} and {@code ?} are in the blacklist and {@code repeating} is 3, + * then {@code """}, {@code "?"}, {@code "?}, and {@code ??} will all be + * removed. + *

+ * Please note that the blacklist will consume {@code O(n ** repeat)} + * many bytes of memory where {@code n} is the number of blacklist entries. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words, int repeat) { + HashSet tempBlacklist = new HashSet<>(); + tempBlacklist.addAll(words); + + // just to be safe + tempBlacklist.remove(null); + tempBlacklist.remove(""); + + if (repeat > 1) { + tempBlacklist = new HashSet<>(getAllCombinations(tempBlacklist, repeat)); + } + + // thanks Java + final Set blacklist = tempBlacklist; + + return PhraseMapper.rename(filterByWords(w -> !blacklist.contains(w))); + } + + private static List getAllCombinations(Collection words, int repeat) { + ArrayList combinations = new ArrayList<>((int) Math.pow(words.size(), repeat)); + combinations.addAll(words); + + int start = 0; + for (; repeat > 1; repeat--) { + int size = combinations.size(); + for (int i = start; i < size; i++) { + for (String word : words) { + combinations.add(combinations.get(i) + word); + } + } + start = size; + } + + return combinations; + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is not contained in an given whitelist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper whitelist(final Iterable words) { + final Set whitelist = new HashSet<>(); + for (String word : words) + whitelist.add(word); + + return PhraseMapper.rename(filterByWords(whitelist::contains)); + } + + /** + * Returns a {@link PhraseMapper} which filters out all words for which the + * given predicate returns {@code false}. 
+ * + * @param wordPredicate + * @return + */ + public static PhraseMapper filterByWords(final Predicate wordPredicate) { + return PhraseMapper.rename((phrase, frequency) -> { + for (String word : phrase.split(" ")) { + if (!wordPredicate.test(word)) { + return null; + } + } + return phrase; + }); + } + + /** + * Similar to {@link PhraseMappers#blacklist(Collection)} with the difference + * being that all phrase which contain any of the given strings anywhere will be + * removed. + *

+ * E.g. A super blacklist with the string {@code "--"} will remove the phrase + * {@code "foo--bar"} while a normal blacklist will not. + * + * @param strings + * @return + */ + public static PhraseMapper superBlacklist(final Iterable strings) { + StringMatcherNode matcher = StringMatcherNode.createRoot(strings); + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (matcher.matches(phrase, i)) { + return null; + } + } + return phrase; + }); + } + + private static class StringMatcherNode { + + private static final StringMatcherNode ACCEPT = new StringMatcherNode(true); + + private final StringMatcherNode[] next; + + private StringMatcherNode(boolean accept) { + next = accept ? null : new StringMatcherNode[65536]; + } + + public boolean matches(String s, int index) { + if (this == ACCEPT) + return true; + + StringMatcherNode node = this; + int length = s.length(); + for (int i = index; i < length; i++) { + if (node == ACCEPT) + return true; + + int c = s.charAt(i); + node = node.next[c]; + if (node == null) + return false; + } + return node == ACCEPT; + } + + public static StringMatcherNode createRoot(final Iterable words) { + final StringMatcherNode root = new StringMatcherNode(false); + + for (String word : words) { + int length = word.length(); + if (length == 0) + return ACCEPT; + + StringMatcherNode node = root; + for (int i = 0; i < length; i++) { + int c = word.charAt(i); + if (i + 1 == length) { + node.next[c] = ACCEPT; + } else { + StringMatcherNode current = node.next[c]; + if (current == ACCEPT) + break; + if (current == null) + current = node.next[c] = new StringMatcherNode(false); + node = current; + } + } + } + + return root; + } + + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases whose frequency is + * less than a given minimum frequency. 
+ * + * @return + */ + public static PhraseMapper removeIfFrequencyIsLessThan(final long minimumFrequency) { + return PhraseMapper.rename((phrase, frequency) -> frequency < minimumFrequency ? null : phrase); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain at least + * one character that is not included in the Latin-1 character set (ISO/IEC + * 8859-1). The Latin-1 character set contains all characters with code points + * in the range [0, 255]. ASCII is a subset of Latin-1 that covers the range [0, + * 127]. Since Latin-1 characters are encoded in 8 bit they are full compatible + * with languages that use simple 1-byte character types such as C or C++. You + * need to apply this filter as long as the native Netspeak C++ implementation + * has no built-in Unicode support. + * + * @return + */ + public static PhraseMapper removeIfContainsNonLatin1Chars() { + final int maxLatin1CodePoint = 255; + + return PhraseMapper.rename((phrase, frequency) -> { + for (int i = 0; i != phrase.length(); ++i) { + if (phrase.codePointAt(i) > maxLatin1CodePoint) { + return null; + } + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeURLsAndEmails() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + // check for Email addresses + if (EMAIL_PATTERN.matcher(lower).find()) + return null; + // matches the URL pattern + if (URL_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + // Email addresses can be right about anything which contains an @. 
+ private static final Pattern EMAIL_PATTERN = Pattern.compile(".@."); + private static final String ALL_COUNTRY_TLD = "a[cdefgilmoqrstuwxz]|b[abdefghijmnorstwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[adefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghiklmnorstuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]"; + // some of the more common domains + // https://w3techs.com/technologies/overview/top_level_domain/all + private static final Pattern URL_PATTERN = Pattern + .compile("www\\.|https?:|ftps?:|\\.(?:com|org|net|edu|gov|xyz|moe|club|online|pro|site|top|shop|info|biz|" + + ALL_COUNTRY_TLD + ")\\b"); + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeFileNames() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + if (FILE_NAME_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + private static final Pattern FILE_NAME_PATTERN = Pattern.compile( + "\\.(?:exe|dll|bin|msi|bat|com|jar|pkg|apk|ini|ai|ico|jpg|jpeg|png|gif|bmp|webp|tif|tag|ps|odp|pps|ppt|pptx|pdf|doc|docx|xml|csv|sql|zip|rar|tar|gz|7z|iso|webm|mov|mkv|mpg|mpeg|mp3|acc|ogg|wav|wmv|mid|midi|mp4|avi|vlc|html|htm|php|asp|aspx|js|css)\\b"); + + /** + * This removes all phrases with additional markers in the Google web corpus. + * This includes: {@code }, {@code }, {@code }, {@code }, + * {@code }, and {@code }. + * + * @return + */ + public static PhraseMapper removeGoogleWebMarkers() { + return PhraseMapper.rename(blacklist(Arrays.asList("", "", "", "", "", ""))); + } + + /** + * This will make surrounding commas some words have its own word. + *

+ * + *

+	 * "foo," -> "foo ,"
+	 * ",foo,," -> ", foo, ,"
+	 * 
+ * + * @return + */ + public static PhraseMapper splitSurroundingCommas() { + return PhraseMapper.rename((phrase, freq) -> { + String[] words = phrase.split(" "); + for (int i = 0; i < words.length; i++) { + String word = words[i]; + int l = word.length(); + if (l > 1 && (word.charAt(0) == ',' || word.charAt(l - 1) == ',')) { + if (word.contentEquals(",,")) { + words[i] = ", ,"; + } else { + if (word.charAt(0) == ',') { + word = ", " + word.substring(1); + } + if (word.charAt(l - 1) == ',') { + word = word.substring(0, l - 1) + " ,"; + } + } + } + } + return String.join(" ", words); + }); + } + + public static PhraseMapper explodeCommas() { + return PhraseMapper.rename((phrase, freq) -> { + if (phrase.indexOf(',') >= 0) { + return normalizeSpaces(phrase.replace(",", " , ")); + } + return phrase; + }); + } + + /** + * This will remove all phrase which have more than {@code n} words. + * + * @param n The maximum number of words allowed per phrase. + * @return + */ + public static PhraseMapper maxNGram(int n) { + return PhraseMapper.rename((phrase, freq) -> { + int words = 1; + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (phrase.charAt(i) == ' ') + words++; + } + return words > n ? null : phrase; + }); + } + + private static final Pattern SPACES_PATTERN = Pattern.compile("\\s{2,}"); + + private static String normalizeSpaces(String str) { + return SPACES_PATTERN.matcher(str).replaceAll(" ").trim(); + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseSource.java b/src/main/java/org/netspeak/preprocessing/PhraseSource.java new file mode 100644 index 0000000..a12f73a --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseSource.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.netspeak.io.PhraseReader; + +/** + * A source of phrases. 
+ * + * @see SimplePhraseSource + */ +public interface PhraseSource { + + Collection getFiles() throws Exception; + + public interface File { + + Path getPath(); + + PhraseReader createReader() throws Exception; + + } + + PhraseSource EMPTY = combine(); + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(PhraseSource... sources) { + return combine(Arrays.asList(sources)); + } + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(Collection sources) { + final ArrayList src = new ArrayList<>(sources); + return new Combined() { + + @Override + public Collection getSources() { + return src; + } + + @Override + public Collection getFiles() throws Exception { + List files = new ArrayList<>(); + for (PhraseSource source : src) { + files.addAll(source.getFiles()); + } + return files; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (PhraseSource source : src) { + if (first) { + first = false; + } else { + sb.append("\n"); + } + sb.append(source.toString()); + } + return sb.toString(); + } + }; + } + + interface Combined extends PhraseSource { + + Collection getSources(); + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Pipeline.java b/src/main/java/org/netspeak/preprocessing/Pipeline.java new file mode 100644 index 0000000..9f400bd --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Pipeline.java @@ -0,0 +1,28 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; + +import org.netspeak.Util.ThrowsSupplier; + +public class Pipeline implements PipelineItem { + + private final List items = new ArrayList<>(); + + public void add(PipelineItem item) { + items.add(item); + } + + public void add(ThrowsSupplier supplier) { + 
items.add(supplier.get()); + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + for (PipelineItem item : items) { + source = item.apply(source); + } + return source; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PipelineItem.java b/src/main/java/org/netspeak/preprocessing/PipelineItem.java new file mode 100644 index 0000000..02194ed --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PipelineItem.java @@ -0,0 +1,8 @@ +package org.netspeak.preprocessing; + +@FunctionalInterface +public interface PipelineItem { + + PhraseSource apply(PhraseSource source) throws Exception; + +} diff --git a/src/main/java/org/netspeak/preprocessing/Preprocessing.java b/src/main/java/org/netspeak/preprocessing/Preprocessing.java new file mode 100755 index 0000000..fe79176 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Preprocessing.java @@ -0,0 +1,371 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsRunnable; +import org.netspeak.io.PhraseFrequencyPair; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.PhraseWriter; +import org.netspeak.io.SimpleCsvReader; +import org.netspeak.io.SimpleCsvWriter; +import org.netspeak.io.SplitterCsvWriter; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAccumulator; + +import static 
java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.DAYS; + +/** + *

+ * A class to convert a number of input phrase files to a number of output phrase files by applying user-defined filters + * on each input phrase. + *

+ * + *

+ * For that reason the user can register classes that implement the {@link PhraseMapper} interface to provide a certain + * filter functions. All filters will then be applied on each phrase in the same order they were registered. Some + * predefined {@link PhraseMapper} can be found in the {@link PhraseMappers} class. + *

+ */ +public final class Preprocessing { + + private Preprocessing() { + } + + /** + * Runs the entire preprocessing step which applies a number of filters on each phrase read from files located in + * {@code phraseSrcDir}. As a precondition all files in {@code phraseSrcDir} must be formatted according to the + * phrase file format as defined in {@code netspeak3-application-java-notes.txt}. + *

+ * .zip files will automatically be opened and processed. It's assumed that a .zip file contains only .csv file. + * + * @param outputDir A directory to store output phrase files. + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static PhraseSource process(PhraseSource input, Path outputDir, Collection mappers, + PreprocessingOptions options) throws Exception { + requireNonNull(input); + requireNonNull(outputDir); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + long start = System.currentTimeMillis(); + + Util.createEmptyDirectory(outputDir); + + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + + if (options.mergeDuplicates) { + Path tmp = outputDir.resolve("tmp"); + Util.createEmptyDirectory(tmp); + + // split all phrases by hash into different buckets such that duplicates are in + // the same bucket + try (SplitterCsvWriter writer = new SplitterCsvWriter(tmp, 1024)) { + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + // use NetspeakCsvReader to read the output of SplitterNetspeakCsvWriter + SimplePhraseSource tmpSource = new SimplePhraseSource(tmp); + tmpSource.setReaderFactory(SimpleCsvReader::new); + + // delete temp files + options.setDeleteSource(DeleteMode.PROGRESSIVE); + + // use a simple HashMap to merge the duplicates + System.out.println("Merging phrases"); + AtomicLong totalPhrasesCount = new AtomicLong(0); + AtomicLong totalDuplicatesCount = new AtomicLong(0); + + processAllFiles(options, tmpSource, file -> { + Map map = new HashMap<>(); + try (PhraseReader reader = file.createReader()) { + long phrases = 0; + AtomicLong dups = new 
AtomicLong(0); + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + phrases++; + map.merge(pair.phrase, pair.frequency, (a, b) -> { + dups.incrementAndGet(); + return a + b; + }); + } + totalPhrasesCount.addAndGet(phrases - dups.get()); + totalDuplicatesCount.addAndGet(dups.get()); + } + + // write map + Path out = outputDir.resolve(file.getPath().getFileName()); + try (SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + for (Entry entry : map.entrySet()) { + writer.write(entry.getKey(), entry.getValue()); + } + } + }); + + double percentage = Math + .round(100. * 10. * totalDuplicatesCount.doubleValue() / totalPhrasesCount.doubleValue()) / 10.; + System.out.println("Total of " + totalPhrasesCount + " phrases with " + totalDuplicatesCount + " (" + + percentage + "%) duplicates merged."); + + // clean up + System.out.println("Deleting temporary directory"); + Files.delete(tmp); + } else { + + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + String outFileName = file.getPath().getFileName().toString().replaceFirst("(?i).csv[^\\\\/]*", "") + + ".csv"; + Path out = outputDir.resolve(Paths.get(outFileName)); + try (PhraseReader reader = file.createReader(); + SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + + return new SimplePhraseSource(outputDir); + } + + /** + * This will iterate over all phases just as {@link #process(PhraseSource, Path, Collection, PreprocessingOptions)} + * would but without changing the file system. + *

+ * All mappers can be thought of as consumers. + * + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static void iterate(PhraseSource input, Collection mappers, PreprocessingOptions options) + throws Exception { + requireNonNull(input); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + options.setDeleteSource(DeleteMode.NONE); + long start = System.currentTimeMillis(); + + System.out.println("Applying mappers."); + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, null, mapperArray, stats); + } + }); + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + } + + private static void processAllFiles(PreprocessingOptions options, PhraseSource input, ProcessAllConsumer consumer) + throws Exception { + ExecutorService executor = Executors.newFixedThreadPool(options.parallelDegree); + DeleteMode deleteSource = options.deleteSource; + try { + List> futures = new ArrayList<>(); + int i = 0; + Collection files = input.getFiles(); + for (final PhraseSource.File file : files) { + int currentIndex = ++i; + futures.add(executor.submit((ThrowsRunnable) () -> { + int percent = currentIndex * 100 / files.size(); + String prefix = "[" + new Date() + "][" + percent + "% " + currentIndex + "/" + files.size() + "] "; + System.out.println(prefix + "Preprocessing " + file); + + consumer.accept(file); + + if (deleteSource == DeleteMode.PROGRESSIVE) { + Files.delete(file.getPath()); + } + }, file.getPath())); + } + Util.getAll(futures); // wait for all tasks to complete + + if (deleteSource 
== DeleteMode.ATOMIC) { + for (final PhraseSource.File file : files) { + Files.delete(file.getPath()); + } + } + } finally { + executor.shutdown(); + executor.awaitTermination(100, DAYS); + } + } + + @FunctionalInterface + private interface ProcessAllConsumer { + + void accept(PhraseSource.File file) throws Exception; + + } + + private static void applyMappers(PhraseReader reader, PhraseWriter writer, PhraseMapper[] mappers, + MapperStats[] stats) throws Exception { + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + String newPhrase = mapAll(pair.phrase, pair.frequency, mappers, stats); + if (newPhrase != null && writer != null) { + writer.write(newPhrase, pair.frequency); + } + } + } + + private static String mapAll(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (stats == null) { + for (PhraseMapper mapper : mappers) { + if (phrase == null || phrase.isEmpty()) + return null; + phrase = mapper.map(phrase, frequency); + } + return phrase == null || phrase.isEmpty() ? 
null : phrase; + } else { + return mapAllWithStats(phrase, frequency, mappers, stats); + } + } + + private static String mapAllWithStats(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (phrase == null || phrase.isEmpty()) + return null; + + for (int i = 0; i < mappers.length; i++) { + long start = System.nanoTime(); + String newPhrase = mappers[i].map(phrase, frequency); + long time = System.nanoTime() - start; + MapperStats s = stats[i]; + s.phrasesTotal.accumulate(1); + s.runTime.accumulate(time); + + if (newPhrase == null || newPhrase.isEmpty()) { + s.phrasesRemoved.accumulate(1); + return null; + } else { + if (phrase.contentEquals(newPhrase)) { + s.phrasesLeftUnchanged.accumulate(1); + } else { + s.phrasesChanged.accumulate(1); + phrase = newPhrase; + } + } + } + return phrase; + } + + private static MapperStats[] createStats(PhraseMapper[] mappers) { + MapperStats[] stats = new MapperStats[mappers.length]; + for (int i = 0; i < mappers.length; i++) { + stats[i] = new MapperStats(mappers[i]); + } + return stats; + } + + private static void printStats(MapperStats[] stats) { + if (stats == null) + return; + + System.out.println(); + for (MapperStats s : stats) { + long total = s.phrasesTotal.get(); + long changed = s.phrasesChanged.get(); + long kept = s.phrasesLeftUnchanged.get(); + long removed = s.phrasesRemoved.get(); + double runTime = s.runTime.get(); + + System.out.println("Mapper: " + s.mapper.getName()); + System.out.println(" total : " + padStart(total, 12)); + if (total > 0) { + double t = total; + System.out.println(" removed: " + padStart(removed, 12) + " (" + percent(removed / t, 2) + ")"); + System.out.println(" changed: " + padStart(changed, 12) + " (" + percent(changed / t, 2) + ")"); + System.out.println(" kept : " + padStart(kept, 12) + " (" + percent(kept / t, 2) + ")"); + System.out.println(" time/phrase: " + round(runTime / total, 2) + "ns/p"); + } + } + System.out.println(); + } + + private static String 
padStart(Object o, int length) { + String s = String.valueOf(o); + + if (s.length() >= length) + return s; + + char[] spaces = new char[length - s.length()]; + for (int i = 0; i < spaces.length; i++) { + spaces[i] = ' '; + } + + return new String(spaces) + s; + } + + private static String percent(double value, int precision) { + return round(value * 100, precision) + "%"; + } + + private static String round(double value, int precision) { + return BigDecimal.valueOf(value).setScale(precision, RoundingMode.HALF_UP).toString(); + } + + private static String readableDuration(Duration duration) { + return duration.toString().substring(2).replaceAll("(\\d[HMS])(?!$)", "$1 ").toLowerCase(); + } + + private static class MapperStats { + + public final PhraseMapper mapper; + public final LongAccumulator phrasesTotal = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesRemoved = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesChanged = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesLeftUnchanged = new LongAccumulator(Long::sum, 0); + /** + * The total run time of the mapper in ns. 
+ */ + public final LongAccumulator runTime = new LongAccumulator(Long::sum, 0); + + public MapperStats(PhraseMapper mapper) { + this.mapper = mapper; + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java new file mode 100644 index 0000000..d2cb2be --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import static java.util.Objects.requireNonNull; + +public class PreprocessingOptions { + int parallelDegree = 1; + boolean mergeDuplicates = false; + DeleteMode deleteSource = DeleteMode.NONE; + boolean verbose = false; + + public PreprocessingOptions() { + } + + public PreprocessingOptions(PreprocessingOptions toCopy) { + parallelDegree = toCopy.parallelDegree; + mergeDuplicates = toCopy.mergeDuplicates; + deleteSource = toCopy.deleteSource; + verbose = toCopy.verbose; + } + + /** + * Sets the maximum number of concurrently processed files. + *

+ * This defaults {@code 1} meaning that files will processed in a single thread. + * + * @param parallelDegree + */ + public void setParallelDegree(int parallelDegree) { + this.parallelDegree = parallelDegree; + } + + /** + * Sets whether to merge duplicate phrases between and within files. + *

+ * This option is necessary if your phrases contain duplicates. + *

+ * This defaults to {@code false}. + * + * @param mergeDuplicates + */ + public void setMergeDuplicates(boolean mergeDuplicates) { + this.mergeDuplicates = mergeDuplicates; + } + + /** + * Sets whether the source files will be deleted after they were read. + *

+ * This option is useful to automatically remove temporary files. + *

+ * This defaults to {@link DeleteMode#NONE}. + * + * @param deleteSource + */ + public void setDeleteSource(DeleteMode deleteSource) { + this.deleteSource = requireNonNull(deleteSource); + } + + public enum DeleteMode { + /** + * No files will be deleted. + */ + NONE, + /** + * All files will be deleted at once after all files have been read. + */ + ATOMIC, + /** + * Files will be deleted as soon as possible. + */ + PROGRESSIVE + } + + /** + * Sets whether additional information about the preprocessing step should be + * logged in the console. + *

+ * Note: Enabling this might make the preprocessing slower. + *

+ * This defaults to {code false}. + * + * @param verbose + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java new file mode 100644 index 0000000..3267865 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java @@ -0,0 +1,199 @@ +package org.netspeak.preprocessing; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.DirectoryStream.Filter; +import java.nio.file.FileSystems; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.PathMatcher; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Function; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.SimpleCsvReader; + +public class SimplePhraseSource implements PhraseSource { + + final Path path; + Function readerFactory = SimpleCsvReader::new; + private Filter fileFilter; + + public SimplePhraseSource(Path path) { + this.path = requireNonNull(path); + } + + public SimplePhraseSource(String path) { + this.path = Paths.get(requireNonNull(path)); + } + + @Override + public String toString() { + return path.toString(); + } + + /** + * Sets the factory to create a new {@link PhraseReader} from the given + * {@link BufferedReader}. + *

+ * This defaults to {@code NetspeakCsvReader::new}. + * + * @param readerFactory + */ + public void setReaderFactory(Function readerFactory) { + this.readerFactory = requireNonNull(readerFactory); + } + + /** + * Sets a filter which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param fileFilter + */ + public void setFileFilter(Filter fileFilter) { + this.fileFilter = fileFilter; + } + + /** + * Sets a glob pattern which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param globPattern + */ + public void setFileFilter(String globPattern) { + if (globPattern == null) { + this.fileFilter = null; + } else { + final PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(globPattern); + this.fileFilter = pathMatcher::matches; + } + } + + @Override + public Collection getFiles() throws Exception { + if (!Files.isDirectory(path)) { + throw new AssertionError("Not a directory " + path); + } + + List files = new ArrayList<>(); + SimplePhraseSource that = this; + + Files.walkFileTree(path, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IOException { + if (fileFilter == null || fileFilter.accept(path)) { + files.add(new PhrasesSourceFile(that, path)); + } + return FileVisitResult.CONTINUE; + } + }); + + return files; + } + + private static class PhrasesSourceFile implements PhraseSource.File { + + private final SimplePhraseSource source; + private final Path path; + + public PhrasesSourceFile(SimplePhraseSource source, Path path) { + this.source = requireNonNull(source); + this.path = requireNonNull(path); + } + + @Override + public Path getPath() { + return path; + } + + @Override + public String toString() { + return path.toString(); + } + + @Override + public PhraseReader createReader() throws Exception { + BufferedReader br; + + String lowerPath = path.toString().toLowerCase(); + if (lowerPath.endsWith(".zip")) { + br = createZipReader(); + } else if (lowerPath.endsWith(".bz2")) { + br = createBz2Reader(); + } else if (lowerPath.endsWith(".gz")) { + br = createGZipReader(); + } else { + br = Files.newBufferedReader(path, UTF_8); + } + + try { + return source.readerFactory.apply(br); + } catch (Throwable e) { + br.close(); + throw e; + } + } + + private BufferedReader createZipReader() throws Exception { + // we assume that the .zip contains only 
one file which is a CSV file + BufferedInputStream bis = null; + ZipInputStream zip = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + zip = new ZipInputStream(bis); + ZipEntry entry = zip.getNextEntry(); + if (entry == null) { + throw new IllegalStateException("The .zip file is empty."); + } + if (!entry.getName().toLowerCase().endsWith(".csv")) { + throw new IllegalStateException("The .zip file is only allowed to contain a CSV file."); + } + return new BufferedReader(new InputStreamReader(zip, UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + if (zip != null) + zip.close(); + throw t; + } + } + + private BufferedReader createBz2Reader() throws Exception { + BufferedInputStream bis = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + return new BufferedReader( + new InputStreamReader(new CompressorStreamFactory().createCompressorInputStream(bis), UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + throw t; + } + } + + private BufferedReader createGZipReader() throws IOException { + return new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(path)), UTF_8)); + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java new file mode 100644 index 0000000..c3ed67f --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java @@ -0,0 +1,99 @@ +package org.netspeak.preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.LongAccumulator; +import java.util.function.Function; + +import 
org.netspeak.io.PhraseFrequencyPair;
import org.netspeak.io.PhraseWriter;
import org.netspeak.io.SimpleCsvWriter;

/**
 * A phrase mapper that will create a vocabulary from all phrases it sees.
 * <p>
 * This mapper will not change any phrases.
 */
public class VocabularyExtractor implements PhraseMapper {

	// Accumulated frequency per word. Discarded (set to null) once the sorted
	// pair list has been built, see getPairs().
	private Map<String, LongAccumulator> vocabulary = new ConcurrentHashMap<>();
	private List<PhraseFrequencyPair> list;

	/**
	 * Lazily converts the accumulated vocabulary into a list of word-frequency
	 * pairs sorted by descending frequency, ties broken by phrase order.
	 * <p>
	 * NOTE(review): the backing map is released after the first call, so
	 * {@link #map(String, long)} must not be invoked afterwards - confirm all
	 * callers finish mapping before querying results.
	 */
	private List<PhraseFrequencyPair> getPairs() {
		if (list == null) {
			list = new ArrayList<>();
			vocabulary.forEach((phrase, counter) -> {
				list.add(new PhraseFrequencyPair(phrase, counter.get()));
			});
			vocabulary = null; // free the (potentially huge) map

			list.sort((a, b) -> {
				if (a.frequency > b.frequency) {
					return -1;
				} else if (a.frequency < b.frequency) {
					return 1;
				}
				return a.phrase.compareTo(b.phrase);
			});
		}
		return list;
	}

	@Override
	public String map(String phrase, long frequency) {
		// NOTE(review): Long::max records the highest frequency of any phrase
		// containing the word rather than summing frequencies - confirm this is
		// the intended vocabulary statistic.
		for (String word : phrase.split(" ")) {
			LongAccumulator counter = vocabulary.computeIfAbsent(word, key -> new LongAccumulator(Long::max, 0));
			counter.accumulate(frequency);
		}
		return phrase;
	}

	/** Writes all word-frequency pairs to the given writer. */
	public void writePairs(PhraseWriter writer) throws Exception {
		for (PhraseFrequencyPair pair : getPairs()) {
			writer.write(pair);
		}
	}

	/** Writes all word-frequency pairs to the given file in simple CSV format. */
	public void writePairs(Path file) throws Exception {
		writePairs(file, SimpleCsvWriter::new);
	}

	/**
	 * Writes all word-frequency pairs to the given file using the given writer
	 * factory.
	 */
	public void writePairs(Path file, Function<BufferedWriter, PhraseWriter> writerFactory) throws Exception {
		// FIX: TRUNCATE_EXISTING was missing. With CREATE alone, writing into an
		// existing, longer file leaves stale bytes at the end of the file.
		try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE,
				StandardOpenOption.TRUNCATE_EXISTING); PhraseWriter phraseWriter = writerFactory.apply(writer)) {
			writePairs(phraseWriter);
		}
	}

	/** Writes all words (one per line, without frequencies) to the given writer. */
	public void writeVocabulary(BufferedWriter writer) throws IOException {
		String newLine = "\n";
		for (PhraseFrequencyPair pair : getPairs()) {
			writer.write(pair.phrase);
			writer.write(newLine);
		}
	}

	/** Writes all words (one per line, without frequencies) to the given file. */
	public void writeVocabulary(Path file) throws IOException {
		// FIX: see writePairs(Path, Function) - TRUNCATE_EXISTING was missing.
		try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE,
				StandardOpenOption.TRUNCATE_EXISTING)) {
			writeVocabulary(writer);
		}
	}

	/** Returns all words sorted by descending frequency. */
	public Set<String> getVocabulary() {
		Set<String> set = new LinkedHashSet<>();
		for (PhraseFrequencyPair pair : getPairs()) {
			set.add(pair.phrase);
		}
		return set;
	}

}
package org.netspeak.usage;

import org.netspeak.Util;
import org.netspeak.preprocessing.ContractionMapper;
import org.netspeak.preprocessing.PhraseMapper;
import org.netspeak.preprocessing.PhraseMappers;
import org.netspeak.preprocessing.PhraseSource;
import org.netspeak.preprocessing.Pipeline;
import org.netspeak.preprocessing.Preprocessing;
import org.netspeak.preprocessing.PreprocessingOptions;
import org.netspeak.preprocessing.SimplePhraseSource;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

/**
 * Demonstrates the usage of {@link Preprocessing}.
 */
public final class NetspeakBuilderUsage {

	public static void main(String[] args) throws Exception {

		Pipeline pipeline = new Pipeline();

		pipeline.add(source -> {
			Path output = Paths.get("C:\\Netspeak\\_out");

			PreprocessingOptions options = new PreprocessingOptions();
			options.setParallelDegree(8);
			options.setMergeDuplicates(true);

			Path conFile = Paths.get("D:\\netspeak\\contractions_eng.txt");
			ContractionMapper con = new ContractionMapper(conFile);

			// FIX: renamed local variable "superBalcklist" -> "superBlacklist" (typo)
			PhraseMapper superBlacklist = PhraseMappers
					.superBlacklist(Util.readWordList(Paths.get("D:\\netspeak\\super_blacklist.txt")));

			return Preprocessing.process(source, output, Arrays.asList(superBlacklist, con), options);
		});

		SimplePhraseSource source1 = new SimplePhraseSource("C:\\Netspeak\\processed_corpora\\eng_ci_web+books");
		pipeline.apply(PhraseSource.combine(source1));
	}

}
package org.netspeak.usage;

import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Scanner;

import org.netspeak.Configuration;
import org.netspeak.Netspeak;
import org.netspeak.NetspeakUtil;
import org.netspeak.generated.NetspeakMessages.Request;
import org.netspeak.generated.NetspeakMessages.Response;

/**
 * Runs an interactive prompt to search Netspeak via command line.
 */
public class NetspeakTerminal {

	public static void main(String[] args) throws Exception {

		// ---------------------------------------------------------------------
		// CONFIGURATION
		// ---------------------------------------------------------------------
		Configuration config = new Configuration();
		config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en");
		config.put(Configuration.CACHE_CAPACITY, "10000");

		// ---------------------------------------------------------------------
		// START NETSPEAK
		// ---------------------------------------------------------------------
		Netspeak netspeak = new Netspeak(config);

		// ---------------------------------------------------------------------
		// TERMINAL INTERACTION
		// ---------------------------------------------------------------------
		PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out));

		try (final Scanner scanner = new Scanner(System.in)) {
			Request.Builder rb = Request.newBuilder();
			while (true) {
				pw.print("\nEnter query (type 'q' to exit): ");
				pw.flush();
				// FIX: guard against EOF (e.g. piped or closed stdin);
				// Scanner.nextLine() would otherwise throw NoSuchElementException.
				if (!scanner.hasNextLine()) {
					break;
				}
				String query = scanner.nextLine();
				if (query.equals("q"))
					break;
				long start = System.currentTimeMillis();
				Request request = rb.setQuery(query).build();
				Response response = netspeak.search(request);
				for (int i = 0; i != response.getPhraseCount(); ++i) {
					System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(),
							NetspeakUtil.toString(response.getPhrase(i)));
				}
				System.out.println("Error code: " + response.getErrorCode());
				System.out.println("Error message: " + response.getErrorMessage());
				System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList()));
				System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery()));
				System.out.println("Time: " + (System.currentTimeMillis() - start));
				rb = request.toBuilder();
			}
		}
	}
}
package org.netspeak.usage;

import java.util.Map;

import org.netspeak.Configuration;
import org.netspeak.ErrorCode;
import org.netspeak.Netspeak;
import org.netspeak.NetspeakUtil;
import org.netspeak.generated.NetspeakMessages.Request;
import org.netspeak.generated.NetspeakMessages.Response;

import com.google.protobuf.InvalidProtocolBufferException;

public class NetspeakUsage {

	public static void main(String[] args) {

		// ---------------------------------------------------------------------
		// CONFIGURATION
		// ---------------------------------------------------------------------
		Configuration config = new Configuration();
		config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en");
		config.put(Configuration.CACHE_CAPACITY, "10000");

		// ---------------------------------------------------------------------
		// START NETSPEAK
		// ---------------------------------------------------------------------
		Netspeak netspeak = new Netspeak(config);

		// ---------------------------------------------------------------------
		// SEARCH NETSPEAK
		// ---------------------------------------------------------------------
		Request.Builder rb = Request.newBuilder();
		rb.setQuery("programming is *");
		// Advanced parameters (optional)
//		rb.setMaxPhraseCount(int); // default: 100 (find at most X n-grams)
//		rb.setPhraseLengthMin(int); // default: 1 (minimum n-gram length)
//		rb.setPhraseLengthMax(int); // default: 5 (maximum n-gram length)

		Request request = rb.build();
		Response response = null;
		try {
			response = netspeak.search(request);
		} catch (InvalidProtocolBufferException e) {
			e.printStackTrace();
		}

		// FIX: bail out if the search failed. The code below dereferenced
		// `response` unconditionally, which caused a NullPointerException
		// whenever an InvalidProtocolBufferException had been caught above.
		if (response == null) {
			return;
		}

		// Tip: As you will see, there are no setter methods to prepare your
		// request object with a new query for your next search. But you can
		// and you should reuse your request object like that:
		// request = request.toBuilder().setQuery("be efficient and ?").build();

		// ---------------------------------------------------------------------
		// ERROR HANDLING
		// ---------------------------------------------------------------------
		// A Netspeak search will never throw any exceptions.
		// Errors are indicated by the response's error code.
		System.out.println("Error: " + response.getErrorCode());
		switch (ErrorCode.fromCode(response.getErrorCode())) {
		case NO_ERROR:
			// ...
			break;
		case INVALID_QUERY:
			// ...
			break;
		case SERVER_ERROR:
			// ...
			break;
		case UNKNOWN_ERROR:
			// ...
			break;
		}

		// You can also handle errors like this:
		// if (ErrorCode.cast(response.getErrorCode()) != ErrorCode.NO_ERROR)

		// ---------------------------------------------------------------------
		// READ RESPONSE
		// ---------------------------------------------------------------------
		// Returns the total frequency (100% basis) of the returned n-grams.
		// This is not the same value as the sum of all n-gram frequencies.
		System.out.println("Total frequency: " + response.getTotalFrequency());
		// Returns the tokenized query string produced by the query lexer.
		System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList()));
		// Returns the parsed (valid) query produced by the query parser.
		System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery()));
		// Returns the request object.
		System.out.println("Request was: " + response.getRequest());

		// Loop through the returned phrases
		for (int i = 0; i != response.getPhraseCount(); ++i) {
			System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(),
					response.getPhrase(i).toString());
		}

		// You can also iterate like that:
		// for (Phrase phrase : response.getPhraseList()) {
		// System.out.println(phrase); // Complete phrase in JSON style
		// }

		// ---------------------------------------------------------------------
		// NETSPEAK PROPERTIES (Some interesting values)
		// ---------------------------------------------------------------------
		try {
			for (Map.Entry entry : netspeak.getProperties().entrySet()) {
				System.out.println(entry);
			}
		} catch (InvalidProtocolBufferException e) {
			e.printStackTrace();
		}
	}
}
package org.netspeak.usage;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

import org.netspeak.Util;
import org.netspeak.Util.ThrowsRunnable;
import org.netspeak.preprocessing.ContractionMapper;
import org.netspeak.preprocessing.HyphenationJoiner;
import org.netspeak.preprocessing.Operations;
import org.netspeak.preprocessing.Operations.StandardOperationsOptions;
import org.netspeak.preprocessing.PhraseMappers;
import org.netspeak.preprocessing.PhraseSource;
import org.netspeak.preprocessing.Pipeline;
import org.netspeak.preprocessing.Preprocessing;
import
org.netspeak.preprocessing.PreprocessingOptions; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; +import org.netspeak.preprocessing.SimplePhraseSource; + +public class PreprocessingUsage { + + /* + * You have to specify two temporary directories. + * + * Ideally, these should be your fastest storage capable of holding the whole + * data set of a pipeline. This means you can read the data from a HDD, process + * it on an SSD. + */ + + static Path temp1 = Paths.get("path/to/temp1"); + static Path temp2 = Paths.get("path/to/temp2"); + + public static void main(String[] args) throws Exception { + useTemp(() -> { + + PhraseSource german = new SimplePhraseSource("path/to/german/data"); + processGerman(german, Paths.get("out/german")); + + }); + } + + private static void useTemp(ThrowsRunnable runnable) throws Exception { + // clear temporary directories before and after pre-processing + Util.delete(temp1, true); + Util.delete(temp2, true); + try { + runnable.runThrowing(); + } finally { + Util.delete(temp1, true); + Util.delete(temp2, true); + } + } + + /** + * + * @throws Exception + */ + static void processGerman(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + operationOptions.getAdditionalMappers() + .add(new ContractionMapper(Util.readResourceWordList("eng/contractions.txt"))); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path 
output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.German german = new HyphenationJoiner.German(); + german.setStopWordList(Util.readResourceWordList("ger/stop-words.txt")); + + return new HyphenationJoiner(german, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void processEnglish(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.English english = new HyphenationJoiner.English(); + + return new HyphenationJoiner(english, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void toLowerCase(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(inputSource -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Preprocessing.process(inputSource, output, Arrays.asList(PhraseMappers.toLowerCase()), 
options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + +} diff --git a/src/main/resources/blacklist.txt b/src/main/resources/blacklist.txt new file mode 100644 index 0000000..96f9ba1 --- /dev/null +++ b/src/main/resources/blacklist.txt @@ -0,0 +1,67 @@ +' +" +„ +“ +„ +” +« +» +` +´ + +-- +--- +---- ++ += +~ +* +% +# +.. +... +.... +: +| +( +) +[ +] +{ +} +< +> +^ +@ +/ +\ +& + +$ +€ +£ +¥ + +Ã +¼ +¤ + +² +³ + +— +• +■ +¬ +→ +· +_ +… +® +© +█ +™ +♥ +Ȳ +¶ +± diff --git a/src/main/resources/eng/contractions.txt b/src/main/resources/eng/contractions.txt new file mode 100644 index 0000000..4c1baca --- /dev/null +++ b/src/main/resources/eng/contractions.txt @@ -0,0 +1,9 @@ +i'm +(he|she|it)'s +(you|we|they)'re + +(i|you|he|she|it|we|they)'(d|ll|ve) + +y'all + +(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't diff --git a/src/main/resources/ger/stop-words.txt b/src/main/resources/ger/stop-words.txt new file mode 100644 index 0000000..310c68e --- /dev/null +++ b/src/main/resources/ger/stop-words.txt @@ -0,0 +1,68 @@ +, +der +und +die +in +von +zu +den +mit +für +des +das +auf +nicht +im +sich +dem +eine +ein +! +an +auch +als +bei +? +oder +aus +nach +zum +einer +zur +wie +so +nur +über +durch +um +am +einen +aber +noch +mehr +einem +bis +dass +vor +daß +dieser +wenn +diese +vom +hier +unter +dann +was +keine +eines +ab +da +schon +sehr +diesem +sowie + +u. +bzw +bzw. 
+ diff --git a/src/main/resources/super-blacklist.txt b/src/main/resources/super-blacklist.txt new file mode 100755 index 0000000..e69de29 diff --git a/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java new file mode 100644 index 0000000..00905da --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java @@ -0,0 +1,94 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; + +import org.junit.Assert; +import org.junit.Test; + +public class ContractionMapperTest { + + public List getContractionPatterns() { + List patterns = new ArrayList<>(); + + patterns.add("i'm"); + patterns.add("(he|she|it)'s"); + patterns.add("(you|we|they)'re"); + patterns.add("(i|you|he|she|it|we|they)'(d|ll|ve)"); + patterns.add("y'all"); + patterns.add("(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't"); + + return patterns; + } + + @Test + public void contractionTest() { + final ContractionMapper mapper = new ContractionMapper(getContractionPatterns()); + + BiConsumer test = (from, to) -> { + String actual = mapper.map(from, 100); + if (actual == to) + return; + if (to == null || actual == null || !to.contentEquals(actual)) { + Assert.fail("\"" + from + "\" was expected to map to \"" + to + "\" but was actually mapped to \"" + + actual + "\"."); + } + }; + + test.accept("Tom", "Tom"); + test.accept("Tom's bar", "Tom's bar"); + test.accept("Tom 's bar", "Tom's bar"); + test.accept("Tom ' s bar", "Tom's bar"); + test.accept("Tom' s bar", "Tom's bar"); + test.accept("Tom s bar", "Tom s bar"); // too little context, so leave it as is + + test.accept("Charls' phone", "Charls' phone"); + test.accept("Charls ' phone", "Charls' phone"); + test.accept("Charls '", "Charls'"); + test.accept("Charls 't", "Charls 't"); + + test.accept("he's nice", "he's nice"); + test.accept("he' 
s nice", "he's nice"); + test.accept("he ' s nice", "he's nice"); + test.accept("he 's nice", "he's nice"); + test.accept("he s nice", "he's nice"); + + test.accept("we'll do it", "we'll do it"); + test.accept("we 'll do it", "we'll do it"); + test.accept("we ' ll do it", "we'll do it"); + test.accept("we' ll do it", "we'll do it"); + test.accept("we ll do it", "we'll do it"); + test.accept("well do it", "well do it"); // well well well + + test.accept("dont", "don't"); + test.accept("don't", "don't"); + test.accept("don 't", "don't"); + test.accept("don ' t", "don't"); + test.accept("don' t", "don't"); + test.accept("don t", "don't"); + + test.accept("DoNt", "DoN't"); + test.accept("DoN't", "DoN't"); + test.accept("DoN 't", "DoN't"); + test.accept("DoN ' t", "DoN't"); + test.accept("DoN' t", "DoN't"); + test.accept("DoN t", "DoN't"); + + test.accept("I'm", "I'm"); + test.accept("I 'm", "I'm"); + test.accept("I ' m", "I'm"); + test.accept("I' m", "I'm"); + test.accept("I m", "I'm"); + + test.accept("I might", "I might"); + + test.accept("won", "won"); + test.accept("won'", null); + test.accept("won '", null); + test.accept("'t open", null); + test.accept("' t open", null); + test.accept("t open", "t open"); // could be real + } + +} diff --git a/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java new file mode 100644 index 0000000..96c9339 --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java @@ -0,0 +1,137 @@ +package org.netspeak.preprocessing; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.junit.Test; + +public class PhraseMappersTest { + + private void phraseMapperTest(PhraseMapper mapper, Collection unchanged, Collection removed, + Map changed) { + String name = mapper.getName(); + + if (unchanged != null) { 
+ for (String expected : unchanged) { + String actual = mapper.map(expected, 100); + assertEquals("Expected unchanged for " + name, expected, actual); + } + } + + if (removed != null) { + for (String expected : removed) { + String actual = mapper.map(expected, 100); + assertEquals("Expected removed for " + name, null, actual); + } + } + + if (changed != null) { + for (Map.Entry transform : changed.entrySet()) { + String actual = mapper.map(transform.getKey(), 100); + assertEquals("Expected changed for " + name, transform.getValue(), actual); + } + } + } + + @Test + public void blacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". - ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 1); + + final Collection unchanged = new ArrayList<>(sharedUnchanged); + unchanged.add("()"); + + final Collection removed = new ArrayList<>(sharedRemoved); + + phraseMapperTest(mapper, unchanged, removed, null); + } + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 4); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("()()-"); + + final Collection removed = new ArrayList<>(); + removed.add("()()"); + removed.add("-.-."); + removed.add("-.-. foo"); + removed.add("foo -.-. foo"); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + + @Test + public void superBlacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". 
- ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.superBlacklist(blacklistedWords); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("foo bar"); + + final Collection removed = new ArrayList<>(); + removed.add("."); + removed.add("."); + removed.add("("); + removed.add(")"); + removed.add("-"); + removed.add("foo -"); + removed.add("- foo"); + removed.add("- foo -"); + removed.add("foo - bar"); + + removed.add("foo-bar"); + removed.add("Dr."); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + +}