diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..dbfe783 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +bin +build +.gradle +.ideaout/ +.idea/ +.settings/ +*.iml +.project +.classpath diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..5eeca62 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT LICENSE + +Copyright (c) 2019 Webis group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100755 index 0000000..2e466b0 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# Netspeak 4 indexing + +This project contains all necessities to create a new Netspeak 4 index. + +This project is mainly intended for developers that want to build a new Netspeak 4 index from a given data set. 
+ + +--- + +## Contributors + +Michael Schmidt (2018 - 2020) + +Martin Trenkmann (2008 - 2013) + +Martin Potthast (2008 - 2020) + +Benno Stein (2008 - 2020) + + + +--- + +# Old Notes + +% NETSPEAK 4 JAVA NOTES +% martin.trenkmann@uni-weimar.de +% November 22, 2013 + + + +Notation +-------- + + # Does need admin permissions (sudo). + $ Does not need admin permissions. + + +Project description +------------------- + + + + +Library dependencies +-------------------- + +This Java project is a language binding for the C++ project netspeak4-application-cpp whose +implementation comes in form of a shared library (.so file). The present Java +application loads the library at runtime and invokes their native routines via +the Java Native Interface (JNI) method. Precompiled libraries for Ubuntu 10.04 +and 12.04 can be found in the lib sub-directory of this project. The native +library itself has some dependencies you need to install as well. To do so run +the following script: + + # /build/install-dependencies.sh + + +Build and install the native library +------------------------------------ + +In the case that there is no precompiled native library available for your +platform, you need to compile the corresponding C++ project by yourself. + +- Checkout netspeak4-application-cpp from webis CVS. +- Build target "Library" with Qt Creator IDE. + +# cp /lib//.so /usr/lib + + +Load native library +------------------- + +Set "-Djava.library.path=/usr/lib" as VM argument. + + +Build Netspeak from n-gram collection +------------------------------------- + +To build Netspeak from a collection of n-grams you have to provide a dedicated +directory with one or more text files as input. Each of these files have to +list a number of n-grams together with their frequencies, one by line. The +format of a single line is defined as follows: + + word_1 SPACE word_2 SPACE ... word_n TAB frequency + +In words: Each line defines an n-gram with its frequency. 
The delimiter between +the n-gram and the frequency is a single tabulator ('\t'). The delimiter to +separate the n-gram's words is a single whitespace (' '). + +Note: Follow this specification strictly to prevent parsing errors. In +particular, ensure the single `\t` delimiter between n-gram and frequency. + + +Getting Started +--------------- + +- `usage.NetspeakBuilderUsage.java` shows how to build Netspeak from a + collection of n-grams. +- `usage.NetspeakTerminal.java` runs a simple command line to search a Netspeak + instance interactively for testing purposes. +- `usage.NetspeakUsage.java` demonstrates how to search Netspeak in more detail + using the Request and Response objects. + +In some cases, if your local hardware, storage space or operating system +(Netspeak runs only on Linux) does not fit, it might be necessary to set up +Netspeak running on a Linux server and to request that instance remotely. + +For that reason build your Netspeak application as usual and run it as a Java +servlet, e.g. with Tomcat, using the project `netspeak4-server`. A running +Netspeak server can then be requested with the `netspeak3-client-java` project from +any Java application. + + +Netspeak query language +----------------------- + +The Netspeak query syntax as described here should be used as reference. There +might be other syntax information out there, e.g. at netspeak.org, which +provides some syntactical simplifications in form of easier to use wildcards or +operators. However, these modified syntaxes are just front-ends and do not work +with the original Netspeak interface. Here is the truth: + + ? is a placeholder for exactly one word and can be sequenced to search for + exactly two, three, four ... words. + + Example: how to ? this + -> how to use this + -> how to do this + -> how to cite this + + * is a placeholder for zero or many words. + + Example: see * works + -> see how it works + -> see if it works + -> see what works + + [] compares options, i.e. 
it checks each word or phrase between these + brackets plus the so-called empty word at that position in the query. + + Example: it's [ great well "so good" ] + -> it's + -> it's great + -> it's well + -> it's so good + + {} checks the order, i.e. it tries to find each permutation of the given + sequence of words or phrases at that position in the query. + + Example: for { "very important people" only } + -> for very important people only + -> for only very important people + + # searches for alternatives of the word following. This operator requests + the optional Netspeak hash-dictionary component and uses [] to compare + each retrieved alternative (except that the empty word is not checked). + The mapping from word to alternatives is completely up to the user when + building Netspeak; for netspeak.org we use this operator for a synonym + search providing the Wordnet dictionary. + + Example: waiting for #response + -> waiting for response + -> waiting for answer + -> waiting for reply + +You can combine the introduced wildcards and operators as you want, but with the +exception that you may not place any wildcard within bracket operators. Also +nested brackets are not allowed. As you can see in the examples above you can +quote phrases to be handled as one entity in `[]` and `{}`. 
+ + + +% Compile via: pandoc from.txt > to.html diff --git a/artifactory.gradle b/artifactory.gradle new file mode 100755 index 0000000..ff96511 --- /dev/null +++ b/artifactory.gradle @@ -0,0 +1,92 @@ +// Fetch Artifactory publishing plugin +buildscript { + repositories { + jcenter() + } + dependencies { + classpath 'org.jfrog.buildinfo:build-info-extractor-gradle:4+' + } +} + +// Apply plugins +apply plugin: 'maven-publish' +apply plugin: org.jfrog.gradle.plugin.artifactory.ArtifactoryPlugin + +// Determine which repositories to pull from and publish to +def pullRelease = 'libs-release' +def pullSnapshot = 'libs-snapshot' +def pushRelease = 'libs-snapshot-webis-gradle' +def pushSnapshot = 'libs-release-webis-gradle' + +if (project.ext.has("nonFree") && project.ext.get("nonFree")) { + pullRelease += '-nonfree' + pullSnapshot += '-nonfree' + pushRelease += '-nonfree' + pushSnapshot += '-nonfree' +} + +repositories { + maven { + url = 'https://repo.webis.de/artifactory/' + pullRelease + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } + maven { + url = 'https://repo.webis.de/artifactory/' + pullSnapshot + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } +} + +// Configure Artifactory remote +artifactory { + contextUrl = "https://repo.webis.de/artifactory" + publish { + repository { + repoKey = version.endsWith('SNAPSHOT') ? 
pushRelease : pushSnapshot + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + maven = true + } + defaults { + publications('mavenJava') + } + } +} + +// Create tasks for generating source and JavaDoc JARs +task sourcesJar(type: Jar, dependsOn: classes) { + classifier = 'sources' + from sourceSets.main.allSource +} + +task javadocJar(type: Jar, dependsOn: javadoc) { + classifier = 'javadoc' + from javadoc.destinationDir +} + +artifacts { + archives javadocJar + archives sourcesJar +} + +// Configure Maven Publishing Information +publishing { + publications { + mavenJava(MavenPublication) { + // Publish binary, source, and JavaDoc JARs + from components.java + artifact sourcesJar + artifact javadocJar + + // Set POM definition + if (project.ext.has("pomDef")) { + pom project.ext.get("pomDef") + } + } + } +} diff --git a/build.gradle b/build.gradle new file mode 100755 index 0000000..339432d --- /dev/null +++ b/build.gradle @@ -0,0 +1,53 @@ +// Apply plugins +apply plugin: 'java' +apply plugin: 'jacoco' +apply plugin: 'application' + +// Basic configuration and settings for all (sub-)projects +allprojects { + group = 'org.netspeak' + version = '1.0' + mainClassName = 'org.netspeak.usage.NetspeakTerminal' + sourceCompatibility = 1.8 + targetCompatibility = 1.8 + + // Set source file encoding + compileJava.options.encoding = "UTF-8" + compileTestJava.options.encoding = "UTF-8" + javadoc.options.encoding = 'UTF-8' + + // Declare global dependencies + dependencies { + compile group: 'org.netspeak', name: 'netspeak4-application-java', version: '1.0' + compile group: 'org.apache.commons', name: 'commons-compress', version: '1.19' + + testImplementation 'junit:junit:4.12' + } + + // Set MANIFEST.MF contents + jar { + manifest { + attributes('Main-Class': mainClassName) + } + } +} + +// Set POM definition +project.ext.pomDef = { + name = 'Netspeak 4 stuff' + description = 'An application with lots of 
miscellaneous functionality related to Netspeak 4' + url = 'http://netspeak.org' + //licenses { + // license { + // name = 'The Apache License, Version 2.0' + // url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + // } + //} + organization { + name = 'Netspeak' + url = 'http://netspeak.org' + } +} + +// Include Artifactory configuration +apply from: 'artifactory.gradle' diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100755 index 0000000..0d4a951 Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100755 index 0000000..37e7bf4 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Thu Aug 02 10:12:04 CEST 2018 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-all.zip diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..cccdd3d --- /dev/null +++ b/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 
+ esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100755 index 0000000..f955316 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/src/main/java/org/netspeak/Util.java b/src/main/java/org/netspeak/Util.java new file mode 100755 index 0000000..5b01a05 --- /dev/null +++ b/src/main/java/org/netspeak/Util.java @@ -0,0 +1,238 @@ +package org.netspeak; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.LinkOption; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import 
java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.regex.MatchResult; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static java.util.Objects.requireNonNull; + +public final class Util { + + private Util() { + } + + /** + * Deletes the given file or directory. + *

+ * System links will not be followed. This will throw for non-empty directories. This operation will do nothing if + * the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile) throws IOException { + delete(dirOrFile, false); + } + + /** + * Deletes the given file or directory (recursively). + *

+ * System links will not be followed. This will throw for non-empty directories if not recursive. This operation + * will do nothing if the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile, boolean recursive) throws IOException { + if (!recursive) { + Files.deleteIfExists(dirOrFile); + } else { + if (!Files.exists(dirOrFile, LinkOption.NOFOLLOW_LINKS)) { + return; + } + Files.walkFileTree(dirOrFile, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + } + + public static void createEmptyDirectory(Path dir) throws IOException { + requireNonNull(dir); + if (Files.isDirectory(dir)) { + if (Files.newDirectoryStream(dir).iterator().hasNext()) { + throw new AssertionError("Is not empty " + dir); + } + } else { + Files.createDirectories(dir); + } + } + + public static List getAll(Iterable> futures) throws InterruptedException, ExecutionException { + List values = new ArrayList<>(); + for (Future f : futures) { + values.add(f.get()); + } + return values; + } + + /** + * Returns the + * + * @param path + * @return + * @throws IOException + */ + public static Set readWordList(Path path) throws IOException { + try (FileInputStream fileIn = new FileInputStream(path.toFile()); + Reader in = new InputStreamReader(fileIn, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static Set readWordList(Reader in) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(in)) { + Set set = new LinkedHashSet<>(); + + bufferedReader.lines().forEach(word -> { + if (word == null || word.isEmpty()) + return; + + word = word.trim(); + if (!word.isEmpty()) { + 
set.add(word); + } + }); + + return set; + } + } + + public static Set readWordList(String path) throws IOException { + return readWordList(Paths.get(path)); + } + + public static Set readResourceWordList(String name) throws IOException { + try (InputStream input = Util.class.getResourceAsStream(name); + Reader in = new InputStreamReader(input, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static String toPhrase(String[] words) { + if (words.length == 1) + return words[0]; + + StringBuilder sb = new StringBuilder(); + sb.append(words[0]); + + for (int i = 1; i < words.length; i++) { + sb.append(' '); + sb.append(words[i]); + } + + return sb.toString(); + } + + + /** + * Replaces all occurrences of the given pattern in the given string with the string returned by the replacer + * function. + * + * @param pattern + * @param string + * @param replacer + * @return + */ + public static String replaceAll(Pattern pattern, String string, Function replacer) { + Matcher matcher = pattern.matcher(string); + + requireNonNull(replacer); + boolean result = matcher.find(); + if (result) { + StringBuilder sb = new StringBuilder(); + int last; + do { + String replacement = replacer.apply(matcher); + sb.append(replacement); + last = matcher.end(); + result = matcher.find(); + } while (result); + sb.append(string, last, string.length()); + return sb.toString(); + } + return string; + } + + + public interface ThrowsRunnable extends Runnable { + + void runThrowing() throws Exception; + + @Override + default void run() { + try { + runThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public interface ThrowsConsumer extends Consumer { + + void acceptThrowing(T t) throws Exception; + + @Override + default void accept(T t) { + try { + acceptThrowing(t); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public 
interface ThrowsSupplier extends Supplier { + + T getThrowing() throws Exception; + + @Override + default T get() { + try { + return getThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + +} diff --git a/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java new file mode 100644 index 0000000..1c15dee --- /dev/null +++ b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java @@ -0,0 +1,130 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for Google books CSV files. + *

+ * These files are a bit difficult to parse because the n-grams are also + * separated by years. So there will be many consecutive occurrences of the same + * phrase but for different years. This reader will automatically parse and + * aggregate these entries. + *

+ * Example: + * + *

+ * collision such    2000    4     4     4
+ * collision such    2001    6     6     6
+ * collision such    2002    6     6     6
+ * collision such    2003    10    11    0
+ * collision such    2004    17    11    5
+ * collision such    2005    14    11    3
+ * collision such    2006    20    22    0
+ * collision such    2007    17    11    7
+ * collision such    2008    19    11    8
+ * 
+ * + * These will all be parsed and aggregated into: + * + *
+ * 
+ * String phrase  = "collision such";
+ * long frequency = 113;
+ * 
+ * 
+ * + * @author Michael Schmidt + * + */ +public class GoogleBooksCsvReader implements PhraseReader { + + private final BufferedReader reader; + private String lastLine = null; + + public GoogleBooksCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = lastLine == null ? reader.readLine() : lastLine; + + MutablePhraseFrequencyPair pair = new MutablePhraseFrequencyPair(null, -1); + while (line != null && !parseLine(line, pair)) { + // we read lines until we find one which parses or arrive at the end + line = reader.readLine(); + } + if (line == null || pair.phrase == null) + return null; + + // aggregate the frequencies of the next lines which also have the current + // phrase + String currentPhrase = pair.phrase; + long currentFrequency = pair.frequency; + + String nextLine; + while ((nextLine = reader.readLine()) != null) { + if (parseLine(nextLine, pair)) { + if (currentPhrase.contentEquals(pair.phrase)) { + currentFrequency += pair.frequency; + } else { + break; + } + } + } + lastLine = nextLine; + + return new PhraseFrequencyPair(currentPhrase, currentFrequency); + } + + /** + * This parses a CSV line. + *

+ * Returns {@code false} if the given line could not be parsed. + */ + private static boolean parseLine(String line, MutablePhraseFrequencyPair pair) { + // e.g. "circumvallate\t1978\t313\t215\t85" + // "The first line tells us that in 1978, the word "circumvallate" occurred 313 + // times overall, on 215 distinct pages and in 85 distinct books." + + // this operation will be done millions of times, so I want to avoid + // String#split + + int firstTab = line.indexOf('\t', 0); + int secondTab = line.indexOf('\t', firstTab + 1); + int thirdTab = line.indexOf('\t', secondTab + 1); + if (firstTab == -1 || secondTab == -1 || thirdTab == -1) + return false; + + // phrases sometimes have a trailing space, so we have to remove that + String phrase = line.substring(0, firstTab).trim(); + // the empty string is not a valid phrase + if (phrase.isEmpty()) { + return false; + } + + pair.phrase = phrase; + pair.frequency = Long.parseLong(line.substring(secondTab + 1, thirdTab)); + + return true; + } + + @Override + public void close() throws IOException { + reader.close(); + } + + private static class MutablePhraseFrequencyPair { + + public String phrase; + public long frequency; + + public MutablePhraseFrequencyPair(final String phrase, final long frequency) { + this.phrase = phrase; + this.frequency = frequency; + } + + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseFrequencyPair.java b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java new file mode 100644 index 0000000..a44ec25 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java @@ -0,0 +1,40 @@ +package org.netspeak.io; + +import static java.util.Objects.requireNonNull; + +public class PhraseFrequencyPair { + + public final String phrase; + public final long frequency; + + /** + * Creates a new phrase frequency pair. + * + * @param phrase + * @param frequency + * @throws NullPointerException if the given phrase is {@code null}. 
+ * @throws IllegalArgumentException if the given frequency is {@code <= 0}. + */ + public PhraseFrequencyPair(final String phrase, final long frequency) { + if (frequency <= 0) { + throw new IllegalArgumentException(); + } + this.phrase = requireNonNull(phrase); + this.frequency = frequency; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof PhraseFrequencyPair) { + PhraseFrequencyPair other = (PhraseFrequencyPair) obj; + return this.phrase.contentEquals(other.phrase) && this.frequency == other.frequency; + } + return false; + } + + @Override + public int hashCode() { + return phrase.hashCode() ^ (int) frequency ^ (int) (frequency >>> 32); + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseReader.java b/src/main/java/org/netspeak/io/PhraseReader.java new file mode 100644 index 0000000..ed8d31a --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseReader.java @@ -0,0 +1,22 @@ +package org.netspeak.io; + +/** + * A interface for readers which return one phrase-frequency-pair at a time. + * + * @see GoogleBooksCsvReader + * @see SimpleCsvReader + * + * @author Michael + */ +public interface PhraseReader extends AutoCloseable { + + /** + * Returns the next phrase-frequency-pair or {@code null} if no other pairs will + * be returned. + * + * @return + * @throws Exception + */ + PhraseFrequencyPair nextPair() throws Exception; + +} diff --git a/src/main/java/org/netspeak/io/PhraseWriter.java b/src/main/java/org/netspeak/io/PhraseWriter.java new file mode 100644 index 0000000..9052ee1 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseWriter.java @@ -0,0 +1,24 @@ +package org.netspeak.io; + +public interface PhraseWriter extends AutoCloseable { + + /** + * Writes the given phrase and frequency. + * + * @param phrase + * @param frequency + * @throws Exception + */ + void write(String phrase, long frequency) throws Exception; + + /** + * Writes the given phrase-frequency-pair. 
+ * + * @param pair + * @throws Exception + */ + default void write(PhraseFrequencyPair pair) throws Exception { + this.write(pair.phrase, pair.frequency); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvReader.java b/src/main/java/org/netspeak/io/SimpleCsvReader.java new file mode 100644 index 0000000..a960cfe --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvReader.java @@ -0,0 +1,61 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for simple CSV files. + *

+ * In these CSV files, every line ({@code \n}) contains a phrase followed by a + * single tab ({@code \t}) followed by the frequency of that phrase. There may + * be duplicate phrases. A phrase is a non-empty list of words each separated by + * a single whitespace ({@code \u0020}) with no leading or trailing spaces. + * + *

+ * hello world	20
+ * i love you	100
+ * hello world	5
+ * 
+ * + * @author Michael Schmidt + * + */ +public class SimpleCsvReader implements PhraseReader { + + private final BufferedReader reader; + + public SimpleCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = reader.readLine(); + + if (line != null) { + // For better performance, we avoid String#split. Instead we know that a line + // only contains one \t, so we search for that index. To validate the format, we + // also search for a second \t. This is equivalent to: + // String[] parts = line.split("\t"); + // if (parts.length == 2) { create the pair } else { null } + int firstTab = line.indexOf('\t'); + int secondTab = line.indexOf('\t', firstTab + 1); + + // The first tab has to exist and it cannot be 0 because the phrase cannot be + // the empty string. The second tab has to not exist. + if (firstTab > 0 && secondTab == -1) { + String phrase = line.substring(0, firstTab); + long frequency = Long.parseLong(line.substring(firstTab + 1)); + return new PhraseFrequencyPair(phrase, frequency); + } + } + + return null; + } + + @Override + public void close() throws IOException { + reader.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvWriter.java b/src/main/java/org/netspeak/io/SimpleCsvWriter.java new file mode 100755 index 0000000..e92e968 --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvWriter.java @@ -0,0 +1,31 @@ +package org.netspeak.io; + +import java.io.BufferedWriter; +import java.io.IOException; + +/** + * A writer for CSV files which can be understood by the Netspeak index builder. + *

+ * For more details on the format see {@link SimpleCsvReader}. + * + * @author Michael Schmidt + */ +public class SimpleCsvWriter implements PhraseWriter { + + private final BufferedWriter writer; + + public SimpleCsvWriter(BufferedWriter writer) { + this.writer = writer; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + writer.append(phrase).append('\t').append(Long.toString(frequency)).append('\n'); + } + + @Override + public void close() throws IOException { + writer.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SplitterCsvWriter.java b/src/main/java/org/netspeak/io/SplitterCsvWriter.java new file mode 100644 index 0000000..37cfa0f --- /dev/null +++ b/src/main/java/org/netspeak/io/SplitterCsvWriter.java @@ -0,0 +1,85 @@ +package org.netspeak.io; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardOpenOption.CREATE_NEW; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.CharsetEncoder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * This CSV writer will create a given number of CSV files which will be used as + * buckets where phrases will be assigned a bucket according to their hash. + * These bags can then be used for further processing. + *

+ * The {@link #write(String, long)} and {@link #write(PhraseFrequencyPair)} + * methods are thread-safe. + * + * @author Michael + * + */ +public class SplitterCsvWriter implements PhraseWriter { + + private final SimpleCsvWriter[] writers; + private final Path destDir; + private boolean initialized = false; + + public SplitterCsvWriter(Path destDir, int bucketCount) { + this.writers = new SimpleCsvWriter[bucketCount]; + this.destDir = destDir; + } + + @Override + public void close() throws Exception { + Exception last = null; + + for (SimpleCsvWriter writer : writers) { + try { + if (writer != null) + writer.close(); + } catch (Exception e) { + last = e; + } + } + + if (last != null) + throw last; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + initializeWriters(); + + int index = phrase.hashCode() % writers.length; + if (index < 0) + index += writers.length; + SimpleCsvWriter writer = writers[index]; + synchronized (writer) { + writer.write(phrase, frequency); + } + } + + private final void initializeWriters() throws IOException { + if (initialized) + return; + synchronized (this) { + if (initialized) + return; + + for (int i = 0; i < writers.length; i++) { + Path path = Paths.get(destDir.toString(), String.valueOf(i) + ".csv"); + CharsetEncoder encoder = UTF_8.newEncoder(); + Writer writer = new OutputStreamWriter(Files.newOutputStream(path, CREATE_NEW), encoder); + writers[i] = new SimpleCsvWriter(new BufferedWriter(writer, 1024 * 256)); + } + + initialized = true; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/ContractionMapper.java b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java new file mode 100755 index 0000000..b3cfdeb --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java @@ -0,0 +1,246 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; 
+import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Consumer; +import java.util.regex.Pattern; + +import static java.util.Locale.ENGLISH; + +public class ContractionMapper implements PhraseMapper { + + private final Pattern contractionPattern; + private final Pattern incompleteContractionPattern; + private final Map knownContractionMap = new HashMap<>(); + private static final Pattern POSSESSIVE_S_PATTERN = Pattern.compile("s '(?= |\\z)", Pattern.CASE_INSENSITIVE); + + public ContractionMapper(Path file) throws IOException { + this(Util.readWordList(file)); + } + + public ContractionMapper(Iterable knownContractions) { + StringBuilder pattern = new StringBuilder(); + Set incompleteContractionSuffixes = new HashSet<>(); + Set incompleteContractionPrefixes = new HashSet<>(); + + for (String known : knownContractions) { + // add know contractions ending with "n't" without ' to the map + // we can't do this for contractions like "we'll" or "i'm" because of the false + // positives + if (known.endsWith("n't")) { + for (String contraction : allCombinations(known)) { + int index = contraction.indexOf('\''); + knownContractionMap.put(contraction.replace("'", "").toLowerCase(ENGLISH), index); + } + } + + // add prefixes and suffixes to lists + int apo = known.indexOf('\''); + incompleteContractionPrefixes.addAll(allCombinations(known.substring(0, apo))); + incompleteContractionSuffixes.addAll(allCombinations(known.substring(apo + 1))); + + // make it all non-capturing for better performance + known = known.replace("\\((?!\\?)", "(?:"); + // replace the ' with all variations + known = known.replace("'", "(?: '|' | ' | )"); + + pattern.append(known); + pattern.append('|'); + } + pattern.append("[^\\s\\S]"); + + // contractionPattern + String finalPattern = "(?<= |\\A)(?:" + pattern.toString() + ")(?= |\\z)"; + // n't can be fixed with minimal context as it's the only contraction 
with both + // prefix and suffix + finalPattern += "|n(?: '|' | ' )t(?= |\\z)"; + // join possessive S + finalPattern += "|(?: '|' | ' )s(?= |\\z)"; + + contractionPattern = Pattern.compile(finalPattern, Pattern.CASE_INSENSITIVE); + + // incompleteContractionPattern + incompleteContractionPrefixes.remove(""); + incompleteContractionSuffixes.remove(""); + + StringBuilder incompletePattern = new StringBuilder(); + incompletePattern.append("(?:\\A| )(?:"); + incompletePattern.append(String.join("|", incompleteContractionPrefixes)); + incompletePattern.append(")(?: ?'\\z)"); + incompletePattern.append("|"); + incompletePattern.append("(?:\\A' ?)(?:"); + incompletePattern.append(String.join("|", incompleteContractionSuffixes)); + incompletePattern.append(")(?: |\\z)"); + + incompleteContractionPattern = Pattern.compile(incompletePattern.toString(), Pattern.CASE_INSENSITIVE); + } + + + @Override + public String map(String phrase, long frequency) { + // phrases with incomplete contractions will be removed + if (incompleteContractionPattern.matcher(phrase).find()) { + return null; + } + + + phrase = Util.replaceAll(contractionPattern, phrase, match -> { + String m = match.group(); + if (m.indexOf('\'') == -1) { + // e.g. "don t" + return m.replace(' ', '\''); + } else { + // e.g. "don' t" or "don 't" or "don ' t" + return m.replace(" ", ""); + } + }); + + String[] words = phrase.split(" "); + boolean changed = false; + for (int i = 0; i < words.length; i++) { + String word = words[i]; + String lowercase = word.toLowerCase(ENGLISH); + Integer ind = knownContractionMap.get(lowercase); + if (ind != null) { + int index = ind; + words[i] = word.substring(0, index) + '\'' + word.substring(index); + changed = true; + } + } + + phrase = changed ? 
Util.toPhrase(words) : phrase; + + phrase = POSSESSIVE_S_PATTERN.matcher(phrase).replaceAll("s'"); + + return phrase; + } + + private static List allCombinations(String pattern) { + List alternatives = new ArrayList<>(); + parseAlternation(pattern, 0, alternatives::add); + + List words = new ArrayList<>(); + for (Concatenation concat : alternatives) { + List builders = new ArrayList<>(); + builders.add(new StringBuilder()); + addCombinations(builders, concat); + builders.forEach(b -> words.add(b.toString())); + } + + return words; + } + + private static void addCombinations(List builders, Concatenation concat) { + for (Element e : concat.getElements()) { + if (e instanceof Literal) { + String value = ((Literal) e).toString(); + builders.forEach(b -> b.append(value)); + } else { + List alternatives = ((Alternation) e).getConcatenations(); + List original = new ArrayList<>(builders); + builders.clear(); + for (Concatenation alternative : alternatives) { + List newBuilders = new ArrayList<>(); + original.forEach(b -> newBuilders.add(new StringBuilder(b))); + addCombinations(newBuilders, alternative); + builders.addAll(newBuilders); + } + } + } + } + + private static int parseAlternation(String pattern, final int startIndex, Consumer consumeConcat) { + int index = startIndex; + + List concat = new ArrayList<>(); + + while (index < pattern.length()) { + char c = pattern.charAt(index++); + if (c == ')') + break; + if (c == '(') { + List alternatives = new ArrayList<>(); + index += parseAlternation(pattern, index, alternatives::add); + if (alternatives.size() == 1) { + concat.addAll(alternatives.get(0).getElements()); + } else { + concat.add(new Alternation(alternatives)); + } + } else if (c == '|') { + consumeConcat.accept(new Concatenation(concat)); + concat = new ArrayList<>(); + } else { + boolean added = false; + if (!concat.isEmpty()) { + Element last = concat.get(concat.size() - 1); + if (last instanceof Literal) { + ((Literal) last).append(c); + added = true; + } 
+ } + if (!added) { + concat.add(new Literal(c)); + } + } + } + + consumeConcat.accept(new Concatenation(concat)); + + return index - startIndex; + } + + private interface Element { + } + + private static class Concatenation { + private final List elements; + + public Concatenation(List elements) { + this.elements = elements; + } + + public List getElements() { + return elements; + } + + } + + private static class Literal implements Element { + private String value; + + public Literal(char value) { + this.value = Character.toString(value); + } + + public void append(char c) { + this.value += c; + } + + @Override + public String toString() { + return value; + } + + } + + private static class Alternation implements Element { + private final List concatenations; + + public Alternation(List concatenations) { + this.concatenations = concatenations; + } + + public List getConcatenations() { + return concatenations; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java new file mode 100755 index 0000000..e27a2f9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java @@ -0,0 +1,337 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsConsumer; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +/** + * This will join all hyphenated words in two phases. + *

+ * In the first phase it will iterate over all input phrases and extract the vocabulary and stop words. This will be done + * according to the given options. + *

+ * The second phase is specific to the joiner set. Generally, it will try to normalize and, where possible, join + * hyphenated words. + * + * @see German + */ +public class HyphenationJoiner implements PipelineItem { + + private Path logFile; + private ThrowsConsumer vocabularyConsumer; + + private final Joiner joiner; + private final Path output; + private final PreprocessingOptions options; + + public HyphenationJoiner(JoinerFactory joinerFactory, Path output, PreprocessingOptions options) throws Exception { + this.joiner = requireNonNull(joinerFactory).create(); + this.output = requireNonNull(output); + this.options = requireNonNull(options); + } + + /** + * Sets an optional log file. + *

+ * Every action the method takes will be logged here in order, one operation per line. The result and the + * reason for not joining will be logged in the following format: + * + *

+	 * {action}:[ {result}:] {phrase}
+	 * 
+ *

+ * The actions are and results are implementation defined and might be different for each language specific joiner. + * See the implementation of the current joiner for more information. + */ + public void setLogger(Path logFile) { + this.logFile = logFile; + } + + public void setVocabularyConsumer(ThrowsConsumer vocabularyConsumer) { + this.vocabularyConsumer = vocabularyConsumer; + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + // Pass 1 + + if (joiner.getRequiresVocabulary()) { + System.out.println("Extracting vocabulary..."); + + VocabularyExtractor vocabExtractor = new VocabularyExtractor(); + Preprocessing.iterate(source, Arrays.asList(vocabExtractor), options); + + System.out.println("Preparing vocabulary..."); + if (vocabularyConsumer != null) { + vocabularyConsumer.accept(vocabExtractor); + } + + Set vocabulary = vocabExtractor.getVocabulary(); + vocabExtractor = null; + System.gc(); + + joiner.setVocabulary(vocabulary); + } + + System.out.println("Joining Hyphenations..."); + options.setMergeDuplicates(true); // this operation is going to create duplicates + + // We use Java 8, so we have to give it a charset name, so it can lookup the charset instance. In never version + // you can give it an instance directly. + String charsetName = StandardCharsets.UTF_8.name(); + final PrintStream logger = logFile == null ? 
null : new PrintStream(logFile.toFile(), charsetName); + try { + joiner.setLogger(logger); + + return Preprocessing.process(source, output, Arrays.asList(joiner), options); + } finally { + if (logger != null) { + logger.close(); + } + } + } + + public interface Joiner extends PhraseMapper { + boolean getRequiresVocabulary(); + + void setVocabulary(Set vocabulary); + + void setLogger(PrintStream logger); + } + + public interface JoinerFactory { + Joiner create() throws Exception; + } + + public static class German implements JoinerFactory { + /** + * The top k words from the vocabulary will be treated as stop words. This will set the k. + */ + int stopWordsTopK = 100; + /** + * An optional stop word list. + *

+ * This list will be merged with the top k stop words from the vocabulary. + */ + Collection stopWordList = null; + + public void setStopWordsTopK(int stopWordsTopK) { + this.stopWordsTopK = stopWordsTopK; + } + + public void setStopWordList(Path stopWordList) throws IOException { + this.stopWordList = Util.readWordList(stopWordList); + } + + public void setStopWordList(Collection stopWordList) { + this.stopWordList = stopWordList; + } + + @Override + public Joiner create() throws Exception { + return new GermanJoiner(this); + } + } + + private static class GermanJoiner implements Joiner { + + private final German options; + + private Set vocabulary; + private Set stopWords = new HashSet<>(); + + private PrintStream logger; + + public GermanJoiner(German options) throws IOException { + this.options = requireNonNull(options); + if (options.stopWordList != null) + stopWords.addAll(options.stopWordList); + } + + @Override + public void setVocabulary(Set vocabulary) { + this.vocabulary = vocabulary; + HyphenationJoiner.addTopK(stopWords, vocabulary, options.stopWordsTopK); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return true; + } + + private String[] normalizeHyphens(String[] words, String phrase) { + if (words.length < 2) + return words; + + int toRemove = 0; + for (int i = 1; i < words.length; i++) { + if ("-".contentEquals(words[i])) + toRemove++; + } + + if (toRemove == 0) + return words; + + String[] newWords = new String[words.length - toRemove]; + newWords[0] = words[0]; + int writeIndex = 1; + for (int i = 1; i < words.length; i++) { + String word = words[i]; + if ("-".contentEquals(words[i])) { + newWords[writeIndex - 1] = newWords[writeIndex - 1] + "-"; + } else { + newWords[writeIndex++] = word; + } + } + + if (logger != null) { + logger.println("Normalize: " + Util.toPhrase(newWords) + ": " + phrase); + } + + return newWords; + } + + private 
String[] joinHyphen(String[] words, String phrase) { + /** + * For all pairs matching the pattern `{words1}- {words2}`, we want to transform + * them to either `{words1}{words2}`, `{words1}-{words2}`, or leave them as is. + */ + + for (int i = 0; i < words.length - 1; i++) { + String word = words[i]; + String next = words[i + 1]; + if (word.length() > 1 && word.charAt(word.length() - 1) == '-') { + + // if the next word is a stop word, we leave it as is. + if (stopWords.contains(next)) { + if (logger != null) { + logger.println("Stop word: " + next + ": " + phrase); + } + continue; + } + + String result = null; + + /** + * To do the word join {word1}{word2}, 3 criteria have to be met: + * + * 1. {word2} can't be a stop word.
+ * 2. {word2} has to begin with a lower case letter.
+ * 3. The concatenation {word1}{word2} has to be a known word. + */ + + if (Character.isLowerCase(next.charAt(0))) { + String concat = word.substring(0, word.length() - 1) + next; + if (vocabulary.contains(concat)) { + result = concat; + if (logger != null) { + logger.println("Full join: " + concat + ": " + phrase); + } + } + } + + words[i] = null; + words[i + 1] = result == null ? word + next : result; + } + } + + return HyphenationJoiner.removeNull(words); + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf('-') == -1) + return phrase; + + String[] words = normalizeHyphens(phrase.split(" "), phrase); + + words = joinHyphen(words, phrase); + + return Util.toPhrase(words); + } + + } + + public static class English implements JoinerFactory { + @Override + public Joiner create() throws Exception { + return new EnglishJoiner(); + } + } + + private static class EnglishJoiner implements Joiner { + + private PrintStream logger; + + @Override + public void setVocabulary(Set vocabulary) { + throw new UnsupportedOperationException(); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return false; + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf(" - ") == -1) + return phrase; + + String newPhrase = phrase.replace(" - ", "-"); + if (logger != null) { + logger.println("Join: " + newPhrase + ": " + phrase); + } + + return newPhrase; + } + + } + + private static String[] removeNull(String[] words) { + int nullEntries = 0; + for (String word : words) + if (word == null) + nullEntries++; + + if (nullEntries == 0) + return words; + + String[] newWords = new String[words.length - nullEntries]; + int writeIndex = 0; + for (String w : words) { + if (w != null) + newWords[writeIndex++] = w; + } + return newWords; + } + + private static void addTopK(Collection consumer, Collection supplier, int k) { + for 
(T item : supplier) { + if (k-- <= 0) + break; + consumer.add(item); + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Operations.java b/src/main/java/org/netspeak/preprocessing/Operations.java new file mode 100644 index 0000000..1d71602 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Operations.java @@ -0,0 +1,198 @@ +package org.netspeak.preprocessing; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.netspeak.Util; + +public abstract class Operations { + + private Operations() { + } + + public static PipelineItem standardOperations(Path output, StandardOperationsOptions operationOptions, + PreprocessingOptions options) { + return source -> { + List mappers = new ArrayList<>(); + + // try to remove as much junk as possible + // In this phase, phrases will only be removed and not altered. + mappers.add(PhraseMappers.removeControlCharacters()); + if (operationOptions.superBlacklist != null) { + mappers.add(PhraseMappers.superBlacklist(operationOptions.superBlacklist)); + } + mappers.add(PhraseMappers.removeGoogleWebMarkers()); + mappers.add(PhraseMappers.removeHTMLEntities()); + mappers.add(PhraseMappers.removeURLsAndEmails()); + mappers.add(PhraseMappers.removeFileNames()); + + // Normalization phase + mappers.add(PhraseMappers.explodeCommas()); + mappers.add(PhraseMappers.removeLeadingDoubleQuote()); + mappers.add(PhraseMappers.joinWordsWithLeadingApostrophe()); + + if (operationOptions.blacklist != null) { + mappers.add( + PhraseMappers.blacklist(operationOptions.blacklist, operationOptions.blacklistCombinations)); + } + if (operationOptions.maxNGram < Integer.MAX_VALUE) { + mappers.add(PhraseMappers.maxNGram(operationOptions.maxNGram)); + } + if (operationOptions.toLowerCase) { + mappers.add(PhraseMappers.toLowerCase()); + } + + if (operationOptions.additionalMappers != 
null) { + mappers.addAll(operationOptions.additionalMappers); + } + + // the above operations are going to produce duplicates + options.setMergeDuplicates(true); + + return Preprocessing.process(source, output, mappers, options); + }; + } + + public static class StandardOperationsOptions { + /** + * The maximum allowed number of words per phrase. + */ + int maxNGram = Integer.MAX_VALUE; + /** + * Whether all phrases should be lower-cased. + */ + boolean toLowerCase = false; + /** + * All phrases with at least one word which can be constructed from at most + * {@link #blacklistCombinations} many blacklisted word will be removed. + */ + Collection blacklist = null; + int blacklistCombinations = 4; + /** + * @see PhraseMappers#superBlacklist(Iterable) + */ + Collection superBlacklist = null; + /** + * Additional mappers which will be executed after the mappers defined by the + * method. + */ + List additionalMappers = new ArrayList<>(); + + public void setBlacklist(Path blacklist) throws IOException { + this.blacklist = Util.readWordList(blacklist); + } + + public void setBlacklist(Collection blacklist) { + this.blacklist = blacklist; + } + + public void setBlacklistCombinations(int blacklistCombinations) { + this.blacklistCombinations = blacklistCombinations; + } + + public void setSuperBlacklist(Path superBlacklist) throws IOException { + this.superBlacklist = Util.readWordList(superBlacklist); + } + + public void setSuperBlacklist(Collection superBlacklist) { + this.superBlacklist = superBlacklist; + } + + public List getAdditionalMappers() { + return additionalMappers; + } + + public void setToLowerCase(boolean toLowerCase) { + this.toLowerCase = toLowerCase; + } + + public void setMaxNGram(int maxNGram) { + this.maxNGram = maxNGram; + } + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. 
+ * @return + */ + public static PipelineItem moveTo(Path output) { + return source -> { + Path dest = output.toAbsolutePath(); + System.out.println("Moving to " + dest); + System.out.println("From:"); + System.out.println(source); + + Util.createEmptyDirectory(dest); + List newSources = new ArrayList<>(); + moveTo(newSources, source, dest); + + System.out.println("Done."); + + if (newSources.size() == 1) { + return newSources.get(0); + } else { + return PhraseSource.combine(newSources); + } + }; + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. + * @return + */ + public static PipelineItem moveTo(String output) { + return moveTo(Paths.get(output)); + } + + private static void moveTo(List out, PhraseSource source, Path dest) throws Exception { + if (source instanceof PhraseSource.Combined) { + for (PhraseSource s : ((PhraseSource.Combined) source).getSources()) { + moveTo(out, s, dest); + } + } else if (source instanceof SimplePhraseSource) { + SimplePhraseSource simple = (SimplePhraseSource) source; + // actually move some files + for (PhraseSource.File file : simple.getFiles()) { + Files.move(file.getPath(), dest.resolve(file.getPath().getFileName())); + } + + SimplePhraseSource newSource = new SimplePhraseSource(dest); + newSource.setReaderFactory(simple.readerFactory); + out.add(newSource); + } else { + throw new UnsupportedOperationException( + "Cannot move files of unknown source class " + source.getClass().getName()); + } + } + + /** + * Deletes all files of the input phrase source. + *

+ * The item will return {@link PhraseSource#EMPTY}. + * + * @return + */ + public static PipelineItem delete() { + return source -> { + System.out.println("Deleting:"); + System.out.println(source); + + for (PhraseSource.File file : source.getFiles()) { + Files.delete(file.getPath()); + } + + System.out.println("Done."); + return PhraseSource.EMPTY; + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMapper.java b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java new file mode 100755 index 0000000..908bdc7 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java @@ -0,0 +1,77 @@ +package org.netspeak.preprocessing; + +/** + * An interface providing a {@link #map(String, long)} function that transforms + * a given phrase. This interface can be used to apply certain string operations + * on phrases, such as case conversion or removal. Filter instances can be + * organized in some sort of collection to be applied one by one on the same + * phrase. + */ +@FunctionalInterface +public interface PhraseMapper { + + /** + * Maps a given input {@code phrase} to some output phrase. The returned phrase + * may be {@code null} or the empty string in which case the phrase will be + * removed from the corpus. + *

+ * The returned phrase is not allowed to contain tabs, line breaks, + * adjacent spaces, and leading or trailing spaces. + * + * @param phrase The input phrase string. This is guaranteed to not be + * {@code null} and to not be the empty string. + * @param frequency The phrase frequency. + * @return The filtered phrase string. + */ + String map(String phrase, long frequency); + + /** + * The name of the PhraseMapper. + *

+ * This name can be useful for diagnostics and will be used by + * {@link Preprocessing} when printing information about a {@link PhraseMapper}. + * By default this will be the name of the class of the mapper. + * + * @return + */ + default String getName() { + return getClass().getName(); + } + + /** + * Returns a new {@link PhraseMapper} which behaves like the given + * {@link PhraseMapper} and with the name of the full name of the caller method. + * + * @param mapper + * @return + */ + static PhraseMapper rename(PhraseMapper mapper) { + StackTraceElement[] stack = Thread.currentThread().getStackTrace(); + StackTraceElement caller = stack[2]; + return rename(caller.getClassName() + "." + caller.getMethodName(), mapper); + } + + /** + * Returns a new {@link PhraseMapper} with the given name which behaves like the + * given {@link PhraseMapper}. + * + * @param name + * @param mapper + * @return + */ + static PhraseMapper rename(String name, PhraseMapper mapper) { + return new PhraseMapper() { + + @Override + public String map(String phrase, long frequency) { + return mapper.map(phrase, frequency); + } + + @Override + public String getName() { + return name; + } + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMappers.java b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java new file mode 100755 index 0000000..74557e9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java @@ -0,0 +1,436 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +/** + * Some common {@link PhraseMapper} to be used in {@link Preprocessing}. + */ +public final class PhraseMappers { + + /** + * Returns a new {@link PhraseMapper} which converts phrases to lower case. + *

+ * Example: "You'll make it" becomes "you'll make it" + *

+ * + * @return + */ + public static PhraseMapper toLowerCase() { + return PhraseMapper.rename((phrase, frequency) -> phrase.toLowerCase()); + } + + /** + * Returns a new {@link PhraseMapper} which removes one leading double quote + * from a word. + *

+ * Example: "fo"o ""bar" will become fo"o "bar" and + * " foo will stay " foo + *

+ * + * @return + */ + public static PhraseMapper removeLeadingDoubleQuote() { + return PhraseMapper.rename((phrase, frequency) -> LEADING_DOUBLE_QUOTE_PATTERN.matcher(phrase).replaceAll("")); + } + + private static final Pattern LEADING_DOUBLE_QUOTE_PATTERN = Pattern + .compile("(?:(?!\\G)|\\A)(?:\\A|(?<= ))\"(?=[^ ])"); + + /** + * Returns a new {@link PhraseMapper} which joins two consecutive words within + * the phrase if the second word starts with an apostrophe. + *

+ * Example: "You 'll make it" will become + * "You'll make it" and "don 't" will become + * "don't" + *

+ * + * @return + */ + public static PhraseMapper joinWordsWithLeadingApostrophe() { + return PhraseMapper.rename((phrase, frequency) -> phrase.replace(" '", "'")); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words) { + return PhraseMapper.rename(blacklist(words, 1)); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is an + * HTML + * entity. + * + * @return + */ + public static PhraseMapper removeHTMLEntities() { + return PhraseMapper.rename(filterByWords(w -> !(w.charAt(0) == '&' && w.charAt(w.length() - 1) == ';'))); + } + + /** + * Removes all control characters. + * + * See + * here + * for more details. + * + * @return + */ + public static PhraseMapper removeControlCharacters() { + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + char c = phrase.charAt(i); + if (c < ' ') // \x00 - \x1F + return null; + if (0x7F <= c && c <= 0x9F) // DEL, \x80 - \x9F + return null; + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + *

+ * Phrases which contains a word which can be constructed by concatenating + * {@code <= repeating} many words from the blacklist will also be removed. I.e. + * if {@code "} and {@code ?} are in the blacklist and {@code repeating} is 3, + * then {@code """}, {@code "?"}, {@code "?}, and {@code ??} will all be + * removed. + *

+ * Please note that the blacklist will consume {@code O(n ** repeat)} + * many bytes of memory where {@code n} is the number of blacklist entries. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words, int repeat) { + HashSet tempBlacklist = new HashSet<>(); + tempBlacklist.addAll(words); + + // just to be safe + tempBlacklist.remove(null); + tempBlacklist.remove(""); + + if (repeat > 1) { + tempBlacklist = new HashSet<>(getAllCombinations(tempBlacklist, repeat)); + } + + // thanks Java + final Set blacklist = tempBlacklist; + + return PhraseMapper.rename(filterByWords(w -> !blacklist.contains(w))); + } + + private static List getAllCombinations(Collection words, int repeat) { + ArrayList combinations = new ArrayList<>((int) Math.pow(words.size(), repeat)); + combinations.addAll(words); + + int start = 0; + for (; repeat > 1; repeat--) { + int size = combinations.size(); + for (int i = start; i < size; i++) { + for (String word : words) { + combinations.add(combinations.get(i) + word); + } + } + start = size; + } + + return combinations; + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is not contained in an given whitelist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper whitelist(final Iterable words) { + final Set whitelist = new HashSet<>(); + for (String word : words) + whitelist.add(word); + + return PhraseMapper.rename(filterByWords(whitelist::contains)); + } + + /** + * Returns a {@link PhraseMapper} which filters out all words for which the + * given predicate returns {@code false}. 
+ * + * @param wordPredicate + * @return + */ + public static PhraseMapper filterByWords(final Predicate wordPredicate) { + return PhraseMapper.rename((phrase, frequency) -> { + for (String word : phrase.split(" ")) { + if (!wordPredicate.test(word)) { + return null; + } + } + return phrase; + }); + } + + /** + * Similar to {@link PhraseMappers#blacklist(Collection)} with the difference + * being that all phrase which contain any of the given strings anywhere will be + * removed. + *

+ * E.g. A super blacklist with the string {@code "--"} will remove the phrase + * {@code "foo--bar"} while a normal blacklist will not. + * + * @param strings + * @return + */ + public static PhraseMapper superBlacklist(final Iterable strings) { + StringMatcherNode matcher = StringMatcherNode.createRoot(strings); + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (matcher.matches(phrase, i)) { + return null; + } + } + return phrase; + }); + } + + private static class StringMatcherNode { + + private static final StringMatcherNode ACCEPT = new StringMatcherNode(true); + + private final StringMatcherNode[] next; + + private StringMatcherNode(boolean accept) { + next = accept ? null : new StringMatcherNode[65536]; + } + + public boolean matches(String s, int index) { + if (this == ACCEPT) + return true; + + StringMatcherNode node = this; + int length = s.length(); + for (int i = index; i < length; i++) { + if (node == ACCEPT) + return true; + + int c = s.charAt(i); + node = node.next[c]; + if (node == null) + return false; + } + return node == ACCEPT; + } + + public static StringMatcherNode createRoot(final Iterable words) { + final StringMatcherNode root = new StringMatcherNode(false); + + for (String word : words) { + int length = word.length(); + if (length == 0) + return ACCEPT; + + StringMatcherNode node = root; + for (int i = 0; i < length; i++) { + int c = word.charAt(i); + if (i + 1 == length) { + node.next[c] = ACCEPT; + } else { + StringMatcherNode current = node.next[c]; + if (current == ACCEPT) + break; + if (current == null) + current = node.next[c] = new StringMatcherNode(false); + node = current; + } + } + } + + return root; + } + + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases whose frequency is + * less than a given minimum frequency. 
+ * + * @return + */ + public static PhraseMapper removeIfFrequencyIsLessThan(final long minimumFrequency) { + return PhraseMapper.rename((phrase, frequency) -> frequency < minimumFrequency ? null : phrase); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain at least + * one character that is not included in the Latin-1 character set (ISO/IEC + * 8859-1). The Latin-1 character set contains all characters with code points + * in the range [0, 255]. ASCII is a subset of Latin-1 that covers the range [0, + * 127]. Since Latin-1 characters are encoded in 8 bit they are full compatible + * with languages that use simple 1-byte character types such as C or C++. You + * need to apply this filter as long as the native Netspeak C++ implementation + * has no built-in Unicode support. + * + * @return + */ + public static PhraseMapper removeIfContainsNonLatin1Chars() { + final int maxLatin1CodePoint = 255; + + return PhraseMapper.rename((phrase, frequency) -> { + for (int i = 0; i != phrase.length(); ++i) { + if (phrase.codePointAt(i) > maxLatin1CodePoint) { + return null; + } + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeURLsAndEmails() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + // check for Email addresses + if (EMAIL_PATTERN.matcher(lower).find()) + return null; + // matches the URL pattern + if (URL_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + // Email addresses can be right about anything which contains an @. 
+ private static final Pattern EMAIL_PATTERN = Pattern.compile(".@."); + private static final String ALL_COUNTRY_TLD = "a[cdefgilmoqrstuwxz]|b[abdefghijmnorstwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[adefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghiklmnorstuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]"; + // some of the more common domains + // https://w3techs.com/technologies/overview/top_level_domain/all + private static final Pattern URL_PATTERN = Pattern + .compile("www\\.|https?:|ftps?:|\\.(?:com|org|net|edu|gov|xyz|moe|club|online|pro|site|top|shop|info|biz|" + + ALL_COUNTRY_TLD + ")\\b"); + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeFileNames() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + if (FILE_NAME_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + private static final Pattern FILE_NAME_PATTERN = Pattern.compile( + "\\.(?:exe|dll|bin|msi|bat|com|jar|pkg|apk|ini|ai|ico|jpg|jpeg|png|gif|bmp|webp|tif|tag|ps|odp|pps|ppt|pptx|pdf|doc|docx|xml|csv|sql|zip|rar|tar|gz|7z|iso|webm|mov|mkv|mpg|mpeg|mp3|acc|ogg|wav|wmv|mid|midi|mp4|avi|vlc|html|htm|php|asp|aspx|js|css)\\b"); + + /** + * This removes all phrases with additional markers in the Google web corpus. + * This includes: {@code }, {@code }, {@code }, {@code }, + * {@code }, and {@code }. + * + * @return + */ + public static PhraseMapper removeGoogleWebMarkers() { + return PhraseMapper.rename(blacklist(Arrays.asList("", "", "", "", "", ""))); + } + + /** + * This will make surrounding commas some words have its own word. + *

+ * + *

+	 * "foo," -> "foo ,"
+	 * ",foo,," -> ", foo, ,"
+	 * 
+ * + * @return + */ + public static PhraseMapper splitSurroundingCommas() { + return PhraseMapper.rename((phrase, freq) -> { + String[] words = phrase.split(" "); + for (int i = 0; i < words.length; i++) { + String word = words[i]; + int l = word.length(); + if (l > 1 && (word.charAt(0) == ',' || word.charAt(l - 1) == ',')) { + if (word.contentEquals(",,")) { + words[i] = ", ,"; + } else { + if (word.charAt(0) == ',') { + word = ", " + word.substring(1); + } + if (word.charAt(l - 1) == ',') { + word = word.substring(0, l - 1) + " ,"; + } + } + } + } + return String.join(" ", words); + }); + } + + public static PhraseMapper explodeCommas() { + return PhraseMapper.rename((phrase, freq) -> { + if (phrase.indexOf(',') >= 0) { + return normalizeSpaces(phrase.replace(",", " , ")); + } + return phrase; + }); + } + + /** + * This will remove all phrase which have more than {@code n} words. + * + * @param n The maximum number of words allowed per phrase. + * @return + */ + public static PhraseMapper maxNGram(int n) { + return PhraseMapper.rename((phrase, freq) -> { + int words = 1; + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (phrase.charAt(i) == ' ') + words++; + } + return words > n ? null : phrase; + }); + } + + private static final Pattern SPACES_PATTERN = Pattern.compile("\\s{2,}"); + + private static String normalizeSpaces(String str) { + return SPACES_PATTERN.matcher(str).replaceAll(" ").trim(); + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseSource.java b/src/main/java/org/netspeak/preprocessing/PhraseSource.java new file mode 100644 index 0000000..a12f73a --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseSource.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.netspeak.io.PhraseReader; + +/** + * A source of phrases. 
+ * + * @see SimplePhraseSource + */ +public interface PhraseSource { + + Collection getFiles() throws Exception; + + public interface File { + + Path getPath(); + + PhraseReader createReader() throws Exception; + + } + + PhraseSource EMPTY = combine(); + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(PhraseSource... sources) { + return combine(Arrays.asList(sources)); + } + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(Collection sources) { + final ArrayList src = new ArrayList<>(sources); + return new Combined() { + + @Override + public Collection getSources() { + return src; + } + + @Override + public Collection getFiles() throws Exception { + List files = new ArrayList<>(); + for (PhraseSource source : src) { + files.addAll(source.getFiles()); + } + return files; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (PhraseSource source : src) { + if (first) { + first = false; + } else { + sb.append("\n"); + } + sb.append(source.toString()); + } + return sb.toString(); + } + }; + } + + interface Combined extends PhraseSource { + + Collection getSources(); + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Pipeline.java b/src/main/java/org/netspeak/preprocessing/Pipeline.java new file mode 100644 index 0000000..9f400bd --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Pipeline.java @@ -0,0 +1,28 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; + +import org.netspeak.Util.ThrowsSupplier; + +public class Pipeline implements PipelineItem { + + private final List items = new ArrayList<>(); + + public void add(PipelineItem item) { + items.add(item); + } + + public void add(ThrowsSupplier supplier) { + 
items.add(supplier.get()); + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + for (PipelineItem item : items) { + source = item.apply(source); + } + return source; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PipelineItem.java b/src/main/java/org/netspeak/preprocessing/PipelineItem.java new file mode 100644 index 0000000..02194ed --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PipelineItem.java @@ -0,0 +1,8 @@ +package org.netspeak.preprocessing; + +@FunctionalInterface +public interface PipelineItem { + + PhraseSource apply(PhraseSource source) throws Exception; + +} diff --git a/src/main/java/org/netspeak/preprocessing/Preprocessing.java b/src/main/java/org/netspeak/preprocessing/Preprocessing.java new file mode 100755 index 0000000..fe79176 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Preprocessing.java @@ -0,0 +1,371 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsRunnable; +import org.netspeak.io.PhraseFrequencyPair; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.PhraseWriter; +import org.netspeak.io.SimpleCsvReader; +import org.netspeak.io.SimpleCsvWriter; +import org.netspeak.io.SplitterCsvWriter; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAccumulator; + +import static 
java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.DAYS; + +/** + *

+ * A class to convert a number of input phrase files to a number of output phrase files by applying user-defined filters + * on each input phrase. + *

+ * + *

+ * For that reason the user can register classes that implement the {@link PhraseMapper} interface to provide a certain + * filter functions. All filters will then be applied on each phrase in the same order they were registered. Some + * predefined {@link PhraseMapper} can be found in the {@link PhraseMappers} class. + *

+ */ +public final class Preprocessing { + + private Preprocessing() { + } + + /** + * Runs the entire preprocessing step which applies a number of filters on each phrase read from files located in + * {@code phraseSrcDir}. As a precondition all files in {@code phraseSrcDir} must be formatted according to the + * phrase file format as defined in {@code netspeak3-application-java-notes.txt}. + *

+ * .zip files will automatically be opened and processed. It's assumed that a .zip file contains only .csv file. + * + * @param outputDir A directory to store output phrase files. + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static PhraseSource process(PhraseSource input, Path outputDir, Collection mappers, + PreprocessingOptions options) throws Exception { + requireNonNull(input); + requireNonNull(outputDir); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + long start = System.currentTimeMillis(); + + Util.createEmptyDirectory(outputDir); + + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + + if (options.mergeDuplicates) { + Path tmp = outputDir.resolve("tmp"); + Util.createEmptyDirectory(tmp); + + // split all phrases by hash into different buckets such that duplicates are in + // the same bucket + try (SplitterCsvWriter writer = new SplitterCsvWriter(tmp, 1024)) { + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + // use NetspeakCsvReader to read the output of SplitterNetspeakCsvWriter + SimplePhraseSource tmpSource = new SimplePhraseSource(tmp); + tmpSource.setReaderFactory(SimpleCsvReader::new); + + // delete temp files + options.setDeleteSource(DeleteMode.PROGRESSIVE); + + // use a simple HashMap to merge the duplicates + System.out.println("Merging phrases"); + AtomicLong totalPhrasesCount = new AtomicLong(0); + AtomicLong totalDuplicatesCount = new AtomicLong(0); + + processAllFiles(options, tmpSource, file -> { + Map map = new HashMap<>(); + try (PhraseReader reader = file.createReader()) { + long phrases = 0; + AtomicLong dups = new 
AtomicLong(0); + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + phrases++; + map.merge(pair.phrase, pair.frequency, (a, b) -> { + dups.incrementAndGet(); + return a + b; + }); + } + totalPhrasesCount.addAndGet(phrases - dups.get()); + totalDuplicatesCount.addAndGet(dups.get()); + } + + // write map + Path out = outputDir.resolve(file.getPath().getFileName()); + try (SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + for (Entry entry : map.entrySet()) { + writer.write(entry.getKey(), entry.getValue()); + } + } + }); + + double percentage = Math + .round(100. * 10. * totalDuplicatesCount.doubleValue() / totalPhrasesCount.doubleValue()) / 10.; + System.out.println("Total of " + totalPhrasesCount + " phrases with " + totalDuplicatesCount + " (" + + percentage + "%) duplicates merged."); + + // clean up + System.out.println("Deleting temporary directory"); + Files.delete(tmp); + } else { + + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + String outFileName = file.getPath().getFileName().toString().replaceFirst("(?i).csv[^\\\\/]*", "") + + ".csv"; + Path out = outputDir.resolve(Paths.get(outFileName)); + try (PhraseReader reader = file.createReader(); + SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + + return new SimplePhraseSource(outputDir); + } + + /** + * This will iterate over all phases just as {@link #process(PhraseSource, Path, Collection, PreprocessingOptions)} + * would but without changing the file system. + *

+ * All mappers can be thought of as consumers. + * + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static void iterate(PhraseSource input, Collection mappers, PreprocessingOptions options) + throws Exception { + requireNonNull(input); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + options.setDeleteSource(DeleteMode.NONE); + long start = System.currentTimeMillis(); + + System.out.println("Applying mappers."); + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, null, mapperArray, stats); + } + }); + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + } + + private static void processAllFiles(PreprocessingOptions options, PhraseSource input, ProcessAllConsumer consumer) + throws Exception { + ExecutorService executor = Executors.newFixedThreadPool(options.parallelDegree); + DeleteMode deleteSource = options.deleteSource; + try { + List> futures = new ArrayList<>(); + int i = 0; + Collection files = input.getFiles(); + for (final PhraseSource.File file : files) { + int currentIndex = ++i; + futures.add(executor.submit((ThrowsRunnable) () -> { + int percent = currentIndex * 100 / files.size(); + String prefix = "[" + new Date() + "][" + percent + "% " + currentIndex + "/" + files.size() + "] "; + System.out.println(prefix + "Preprocessing " + file); + + consumer.accept(file); + + if (deleteSource == DeleteMode.PROGRESSIVE) { + Files.delete(file.getPath()); + } + }, file.getPath())); + } + Util.getAll(futures); // wait for all tasks to complete + + if (deleteSource 
== DeleteMode.ATOMIC) { + for (final PhraseSource.File file : files) { + Files.delete(file.getPath()); + } + } + } finally { + executor.shutdown(); + executor.awaitTermination(100, DAYS); + } + } + + @FunctionalInterface + private interface ProcessAllConsumer { + + void accept(PhraseSource.File file) throws Exception; + + } + + private static void applyMappers(PhraseReader reader, PhraseWriter writer, PhraseMapper[] mappers, + MapperStats[] stats) throws Exception { + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + String newPhrase = mapAll(pair.phrase, pair.frequency, mappers, stats); + if (newPhrase != null && writer != null) { + writer.write(newPhrase, pair.frequency); + } + } + } + + private static String mapAll(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (stats == null) { + for (PhraseMapper mapper : mappers) { + if (phrase == null || phrase.isEmpty()) + return null; + phrase = mapper.map(phrase, frequency); + } + return phrase == null || phrase.isEmpty() ? 
null : phrase; + } else { + return mapAllWithStats(phrase, frequency, mappers, stats); + } + } + + private static String mapAllWithStats(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (phrase == null || phrase.isEmpty()) + return null; + + for (int i = 0; i < mappers.length; i++) { + long start = System.nanoTime(); + String newPhrase = mappers[i].map(phrase, frequency); + long time = System.nanoTime() - start; + MapperStats s = stats[i]; + s.phrasesTotal.accumulate(1); + s.runTime.accumulate(time); + + if (newPhrase == null || newPhrase.isEmpty()) { + s.phrasesRemoved.accumulate(1); + return null; + } else { + if (phrase.contentEquals(newPhrase)) { + s.phrasesLeftUnchanged.accumulate(1); + } else { + s.phrasesChanged.accumulate(1); + phrase = newPhrase; + } + } + } + return phrase; + } + + private static MapperStats[] createStats(PhraseMapper[] mappers) { + MapperStats[] stats = new MapperStats[mappers.length]; + for (int i = 0; i < mappers.length; i++) { + stats[i] = new MapperStats(mappers[i]); + } + return stats; + } + + private static void printStats(MapperStats[] stats) { + if (stats == null) + return; + + System.out.println(); + for (MapperStats s : stats) { + long total = s.phrasesTotal.get(); + long changed = s.phrasesChanged.get(); + long kept = s.phrasesLeftUnchanged.get(); + long removed = s.phrasesRemoved.get(); + double runTime = s.runTime.get(); + + System.out.println("Mapper: " + s.mapper.getName()); + System.out.println(" total : " + padStart(total, 12)); + if (total > 0) { + double t = total; + System.out.println(" removed: " + padStart(removed, 12) + " (" + percent(removed / t, 2) + ")"); + System.out.println(" changed: " + padStart(changed, 12) + " (" + percent(changed / t, 2) + ")"); + System.out.println(" kept : " + padStart(kept, 12) + " (" + percent(kept / t, 2) + ")"); + System.out.println(" time/phrase: " + round(runTime / total, 2) + "ns/p"); + } + } + System.out.println(); + } + + private static String 
padStart(Object o, int length) { + String s = String.valueOf(o); + + if (s.length() >= length) + return s; + + char[] spaces = new char[length - s.length()]; + for (int i = 0; i < spaces.length; i++) { + spaces[i] = ' '; + } + + return new String(spaces) + s; + } + + private static String percent(double value, int precision) { + return round(value * 100, precision) + "%"; + } + + private static String round(double value, int precision) { + return BigDecimal.valueOf(value).setScale(precision, RoundingMode.HALF_UP).toString(); + } + + private static String readableDuration(Duration duration) { + return duration.toString().substring(2).replaceAll("(\\d[HMS])(?!$)", "$1 ").toLowerCase(); + } + + private static class MapperStats { + + public final PhraseMapper mapper; + public final LongAccumulator phrasesTotal = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesRemoved = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesChanged = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesLeftUnchanged = new LongAccumulator(Long::sum, 0); + /** + * The total run time of the mapper in ns. 
+ */ + public final LongAccumulator runTime = new LongAccumulator(Long::sum, 0); + + public MapperStats(PhraseMapper mapper) { + this.mapper = mapper; + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java new file mode 100644 index 0000000..d2cb2be --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import static java.util.Objects.requireNonNull; + +public class PreprocessingOptions { + int parallelDegree = 1; + boolean mergeDuplicates = false; + DeleteMode deleteSource = DeleteMode.NONE; + boolean verbose = false; + + public PreprocessingOptions() { + } + + public PreprocessingOptions(PreprocessingOptions toCopy) { + parallelDegree = toCopy.parallelDegree; + mergeDuplicates = toCopy.mergeDuplicates; + deleteSource = toCopy.deleteSource; + verbose = toCopy.verbose; + } + + /** + * Sets the maximum number of concurrently processed files. + *

+ * This defaults {@code 1} meaning that files will processed in a single thread. + * + * @param parallelDegree + */ + public void setParallelDegree(int parallelDegree) { + this.parallelDegree = parallelDegree; + } + + /** + * Sets whether to merge duplicate phrases between and within files. + *

+ * This option is necessary if your phrases contain duplicates. + *

+ * This defaults to {@code false}. + * + * @param mergeDuplicates + */ + public void setMergeDuplicates(boolean mergeDuplicates) { + this.mergeDuplicates = mergeDuplicates; + } + + /** + * Sets whether the source files will be deleted after they were read. + *

+ * This option is useful to automatically remove temporary files. + *

+ * This defaults to {@link DeleteMode#NONE}. + * + * @param deleteSource + */ + public void setDeleteSource(DeleteMode deleteSource) { + this.deleteSource = requireNonNull(deleteSource); + } + + public enum DeleteMode { + /** + * No files will be deleted. + */ + NONE, + /** + * All files will be deleted at once after all files have been read. + */ + ATOMIC, + /** + * Files will be deleted as soon as possible. + */ + PROGRESSIVE + } + + /** + * Sets whether additional information about the preprocessing step should be + * logged in the console. + *

+ * Note: Enabling this might make the preprocessing slower. + *

+ * This defaults to {code false}. + * + * @param verbose + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java new file mode 100644 index 0000000..3267865 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java @@ -0,0 +1,199 @@ +package org.netspeak.preprocessing; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.DirectoryStream.Filter; +import java.nio.file.FileSystems; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.PathMatcher; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Function; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.SimpleCsvReader; + +public class SimplePhraseSource implements PhraseSource { + + final Path path; + Function readerFactory = SimpleCsvReader::new; + private Filter fileFilter; + + public SimplePhraseSource(Path path) { + this.path = requireNonNull(path); + } + + public SimplePhraseSource(String path) { + this.path = Paths.get(requireNonNull(path)); + } + + @Override + public String toString() { + return path.toString(); + } + + /** + * Sets the factory to create a new {@link PhraseReader} from the given + * {@link BufferedReader}. + *

+ * This defaults to {@code NetspeakCsvReader::new}. + * + * @param readerFactory + */ + public void setReaderFactory(Function readerFactory) { + this.readerFactory = requireNonNull(readerFactory); + } + + /** + * Sets a filter which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param fileFilter + */ + public void setFileFilter(Filter fileFilter) { + this.fileFilter = fileFilter; + } + + /** + * Sets a glob pattern which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param globPattern + */ + public void setFileFilter(String globPattern) { + if (globPattern == null) { + this.fileFilter = null; + } else { + final PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(globPattern); + this.fileFilter = pathMatcher::matches; + } + } + + @Override + public Collection getFiles() throws Exception { + if (!Files.isDirectory(path)) { + throw new AssertionError("Not a directory " + path); + } + + List files = new ArrayList<>(); + SimplePhraseSource that = this; + + Files.walkFileTree(path, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IOException { + if (fileFilter == null || fileFilter.accept(path)) { + files.add(new PhrasesSourceFile(that, path)); + } + return FileVisitResult.CONTINUE; + } + }); + + return files; + } + + private static class PhrasesSourceFile implements PhraseSource.File { + + private final SimplePhraseSource source; + private final Path path; + + public PhrasesSourceFile(SimplePhraseSource source, Path path) { + this.source = requireNonNull(source); + this.path = requireNonNull(path); + } + + @Override + public Path getPath() { + return path; + } + + @Override + public String toString() { + return path.toString(); + } + + @Override + public PhraseReader createReader() throws Exception { + BufferedReader br; + + String lowerPath = path.toString().toLowerCase(); + if (lowerPath.endsWith(".zip")) { + br = createZipReader(); + } else if (lowerPath.endsWith(".bz2")) { + br = createBz2Reader(); + } else if (lowerPath.endsWith(".gz")) { + br = createGZipReader(); + } else { + br = Files.newBufferedReader(path, UTF_8); + } + + try { + return source.readerFactory.apply(br); + } catch (Throwable e) { + br.close(); + throw e; + } + } + + private BufferedReader createZipReader() throws Exception { + // we assume that the .zip contains only 
one file which is a CSV file + BufferedInputStream bis = null; + ZipInputStream zip = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + zip = new ZipInputStream(bis); + ZipEntry entry = zip.getNextEntry(); + if (entry == null) { + throw new IllegalStateException("The .zip file is empty."); + } + if (!entry.getName().toLowerCase().endsWith(".csv")) { + throw new IllegalStateException("The .zip file is only allowed to contain a CSV file."); + } + return new BufferedReader(new InputStreamReader(zip, UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + if (zip != null) + zip.close(); + throw t; + } + } + + private BufferedReader createBz2Reader() throws Exception { + BufferedInputStream bis = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + return new BufferedReader( + new InputStreamReader(new CompressorStreamFactory().createCompressorInputStream(bis), UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + throw t; + } + } + + private BufferedReader createGZipReader() throws IOException { + return new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(path)), UTF_8)); + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java new file mode 100644 index 0000000..c3ed67f --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java @@ -0,0 +1,99 @@ +package org.netspeak.preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.LongAccumulator; +import java.util.function.Function; + +import 
org.netspeak.io.PhraseFrequencyPair;
import org.netspeak.io.PhraseWriter;
import org.netspeak.io.SimpleCsvWriter;

/**
 * A phrase mapper that will create a vocabulary from all phrases it sees.
 * <p>
 * This mapper will not change any phrases.
 */
public class VocabularyExtractor implements PhraseMapper {

	// Accumulated frequency per word. Discarded (set to null) once the sorted
	// pair list has been built, see getPairs().
	private Map<String, LongAccumulator> vocabulary = new ConcurrentHashMap<>();
	private List<PhraseFrequencyPair> list;

	/**
	 * Lazily converts the accumulated vocabulary into a list of word-frequency
	 * pairs sorted by descending frequency, ties broken by phrase order.
	 * <p>
	 * NOTE(review): the backing map is released after the first call, so
	 * {@link #map(String, long)} must not be invoked afterwards - confirm all
	 * callers finish mapping before querying results.
	 */
	private List<PhraseFrequencyPair> getPairs() {
		if (list == null) {
			list = new ArrayList<>();
			vocabulary.forEach((phrase, counter) -> {
				list.add(new PhraseFrequencyPair(phrase, counter.get()));
			});
			vocabulary = null; // free the (potentially huge) map

			list.sort((a, b) -> {
				if (a.frequency > b.frequency) {
					return -1;
				} else if (a.frequency < b.frequency) {
					return 1;
				}
				return a.phrase.compareTo(b.phrase);
			});
		}
		return list;
	}

	@Override
	public String map(String phrase, long frequency) {
		// NOTE(review): Long::max records the highest frequency of any phrase
		// containing the word rather than summing frequencies - confirm this is
		// the intended vocabulary statistic.
		for (String word : phrase.split(" ")) {
			LongAccumulator counter = vocabulary.computeIfAbsent(word, key -> new LongAccumulator(Long::max, 0));
			counter.accumulate(frequency);
		}
		return phrase;
	}

	/** Writes all word-frequency pairs to the given writer. */
	public void writePairs(PhraseWriter writer) throws Exception {
		for (PhraseFrequencyPair pair : getPairs()) {
			writer.write(pair);
		}
	}

	/** Writes all word-frequency pairs to the given file in simple CSV format. */
	public void writePairs(Path file) throws Exception {
		writePairs(file, SimpleCsvWriter::new);
	}

	/**
	 * Writes all word-frequency pairs to the given file using the given writer
	 * factory.
	 */
	public void writePairs(Path file, Function<BufferedWriter, PhraseWriter> writerFactory) throws Exception {
		// FIX: TRUNCATE_EXISTING was missing. With CREATE alone, writing into an
		// existing, longer file leaves stale bytes at the end of the file.
		try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE,
				StandardOpenOption.TRUNCATE_EXISTING); PhraseWriter phraseWriter = writerFactory.apply(writer)) {
			writePairs(phraseWriter);
		}
	}

	/** Writes all words (one per line, without frequencies) to the given writer. */
	public void writeVocabulary(BufferedWriter writer) throws IOException {
		String newLine = "\n";
		for (PhraseFrequencyPair pair : getPairs()) {
			writer.write(pair.phrase);
			writer.write(newLine);
		}
	}

	/** Writes all words (one per line, without frequencies) to the given file. */
	public void writeVocabulary(Path file) throws IOException {
		// FIX: see writePairs(Path, Function) - TRUNCATE_EXISTING was missing.
		try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE,
				StandardOpenOption.TRUNCATE_EXISTING)) {
			writeVocabulary(writer);
		}
	}

	/** Returns all words sorted by descending frequency. */
	public Set<String> getVocabulary() {
		Set<String> set = new LinkedHashSet<>();
		for (PhraseFrequencyPair pair : getPairs()) {
			set.add(pair.phrase);
		}
		return set;
	}

}
package org.netspeak.usage;

import org.netspeak.Util;
import org.netspeak.preprocessing.ContractionMapper;
import org.netspeak.preprocessing.PhraseMapper;
import org.netspeak.preprocessing.PhraseMappers;
import org.netspeak.preprocessing.PhraseSource;
import org.netspeak.preprocessing.Pipeline;
import org.netspeak.preprocessing.Preprocessing;
import org.netspeak.preprocessing.PreprocessingOptions;
import org.netspeak.preprocessing.SimplePhraseSource;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

/**
 * Demonstrates the usage of {@link Preprocessing}.
 */
public final class NetspeakBuilderUsage {

	public static void main(String[] args) throws Exception {

		Pipeline pipeline = new Pipeline();

		pipeline.add(source -> {
			Path output = Paths.get("C:\\Netspeak\\_out");

			PreprocessingOptions options = new PreprocessingOptions();
			options.setParallelDegree(8);
			options.setMergeDuplicates(true);

			Path conFile = Paths.get("D:\\netspeak\\contractions_eng.txt");
			ContractionMapper con = new ContractionMapper(conFile);

			// FIX: renamed local variable "superBalcklist" -> "superBlacklist" (typo)
			PhraseMapper superBlacklist = PhraseMappers
					.superBlacklist(Util.readWordList(Paths.get("D:\\netspeak\\super_blacklist.txt")));

			return Preprocessing.process(source, output, Arrays.asList(superBlacklist, con), options);
		});

		SimplePhraseSource source1 = new SimplePhraseSource("C:\\Netspeak\\processed_corpora\\eng_ci_web+books");
		pipeline.apply(PhraseSource.combine(source1));
	}

}
package org.netspeak.usage;

import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Scanner;

import org.netspeak.Configuration;
import org.netspeak.Netspeak;
import org.netspeak.NetspeakUtil;
import org.netspeak.generated.NetspeakMessages.Request;
import org.netspeak.generated.NetspeakMessages.Response;

/**
 * Runs an interactive prompt to search Netspeak via command line.
 */
public class NetspeakTerminal {

	public static void main(String[] args) throws Exception {

		// ---------------------------------------------------------------------
		// CONFIGURATION
		// ---------------------------------------------------------------------
		Configuration config = new Configuration();
		config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en");
		config.put(Configuration.CACHE_CAPACITY, "10000");

		// ---------------------------------------------------------------------
		// START NETSPEAK
		// ---------------------------------------------------------------------
		Netspeak netspeak = new Netspeak(config);

		// ---------------------------------------------------------------------
		// TERMINAL INTERACTION
		// ---------------------------------------------------------------------
		PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out));

		try (final Scanner scanner = new Scanner(System.in)) {
			Request.Builder rb = Request.newBuilder();
			while (true) {
				pw.print("\nEnter query (type 'q' to exit): ");
				pw.flush();
				// FIX: guard against EOF (e.g. piped or closed stdin);
				// Scanner.nextLine() would otherwise throw NoSuchElementException.
				if (!scanner.hasNextLine()) {
					break;
				}
				String query = scanner.nextLine();
				if (query.equals("q"))
					break;
				long start = System.currentTimeMillis();
				Request request = rb.setQuery(query).build();
				Response response = netspeak.search(request);
				for (int i = 0; i != response.getPhraseCount(); ++i) {
					System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(),
							NetspeakUtil.toString(response.getPhrase(i)));
				}
				System.out.println("Error code: " + response.getErrorCode());
				System.out.println("Error message: " + response.getErrorMessage());
				System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList()));
				System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery()));
				System.out.println("Time: " + (System.currentTimeMillis() - start));
				rb = request.toBuilder();
			}
		}
	}
}
package org.netspeak.usage;

import java.util.Map;

import org.netspeak.Configuration;
import org.netspeak.ErrorCode;
import org.netspeak.Netspeak;
import org.netspeak.NetspeakUtil;
import org.netspeak.generated.NetspeakMessages.Request;
import org.netspeak.generated.NetspeakMessages.Response;

import com.google.protobuf.InvalidProtocolBufferException;

public class NetspeakUsage {

	public static void main(String[] args) {

		// ---------------------------------------------------------------------
		// CONFIGURATION
		// ---------------------------------------------------------------------
		Configuration config = new Configuration();
		config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en");
		config.put(Configuration.CACHE_CAPACITY, "10000");

		// ---------------------------------------------------------------------
		// START NETSPEAK
		// ---------------------------------------------------------------------
		Netspeak netspeak = new Netspeak(config);

		// ---------------------------------------------------------------------
		// SEARCH NETSPEAK
		// ---------------------------------------------------------------------
		Request.Builder rb = Request.newBuilder();
		rb.setQuery("programming is *");
		// Advanced parameters (optional)
//		rb.setMaxPhraseCount(int); // default: 100 (find at most X n-grams)
//		rb.setPhraseLengthMin(int); // default: 1 (minimum n-gram length)
//		rb.setPhraseLengthMax(int); // default: 5 (maximum n-gram length)

		Request request = rb.build();
		Response response = null;
		try {
			response = netspeak.search(request);
		} catch (InvalidProtocolBufferException e) {
			e.printStackTrace();
		}

		// FIX: bail out if the search failed. The code below dereferenced
		// `response` unconditionally, which caused a NullPointerException
		// whenever an InvalidProtocolBufferException had been caught above.
		if (response == null) {
			return;
		}

		// Tip: As you will see, there are no setter methods to prepare your
		// request object with a new query for your next search. But you can
		// and you should reuse your request object like that:
		// request = request.toBuilder().setQuery("be efficient and ?").build();

		// ---------------------------------------------------------------------
		// ERROR HANDLING
		// ---------------------------------------------------------------------
		// A Netspeak search will never throw any exceptions.
		// Errors are indicated by the response's error code.
		System.out.println("Error: " + response.getErrorCode());
		switch (ErrorCode.fromCode(response.getErrorCode())) {
		case NO_ERROR:
			// ...
			break;
		case INVALID_QUERY:
			// ...
			break;
		case SERVER_ERROR:
			// ...
			break;
		case UNKNOWN_ERROR:
			// ...
			break;
		}

		// You can also handle errors like this:
		// if (ErrorCode.cast(response.getErrorCode()) != ErrorCode.NO_ERROR)

		// ---------------------------------------------------------------------
		// READ RESPONSE
		// ---------------------------------------------------------------------
		// Returns the total frequency (100% basis) of the returned n-grams.
		// This is not the same value as the sum of all n-gram frequencies.
		System.out.println("Total frequency: " + response.getTotalFrequency());
		// Returns the tokenized query string produced by the query lexer.
		System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList()));
		// Returns the parsed (valid) query produced by the query parser.
		System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery()));
		// Returns the request object.
		System.out.println("Request was: " + response.getRequest());

		// Loop through the returned phrases
		for (int i = 0; i != response.getPhraseCount(); ++i) {
			System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(),
					response.getPhrase(i).toString());
		}

		// You can also iterate like that:
		// for (Phrase phrase : response.getPhraseList()) {
		// System.out.println(phrase); // Complete phrase in JSON style
		// }

		// ---------------------------------------------------------------------
		// NETSPEAK PROPERTIES (Some interesting values)
		// ---------------------------------------------------------------------
		try {
			for (Map.Entry entry : netspeak.getProperties().entrySet()) {
				System.out.println(entry);
			}
		} catch (InvalidProtocolBufferException e) {
			e.printStackTrace();
		}
	}
}
package org.netspeak.usage;

import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;

import org.netspeak.Util;
import org.netspeak.Util.ThrowsRunnable;
import org.netspeak.preprocessing.ContractionMapper;
import org.netspeak.preprocessing.HyphenationJoiner;
import org.netspeak.preprocessing.Operations;
import org.netspeak.preprocessing.Operations.StandardOperationsOptions;
import org.netspeak.preprocessing.PhraseMappers;
import org.netspeak.preprocessing.PhraseSource;
import org.netspeak.preprocessing.Pipeline;
import org.netspeak.preprocessing.Preprocessing;
import
org.netspeak.preprocessing.PreprocessingOptions; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; +import org.netspeak.preprocessing.SimplePhraseSource; + +public class PreprocessingUsage { + + /* + * You have to specify two temporary directories. + * + * Ideally, these should be your fastest storage capable of holding the whole + * data set of a pipeline. This means you can read the data from a HDD, process + * it on an SSD. + */ + + static Path temp1 = Paths.get("path/to/temp1"); + static Path temp2 = Paths.get("path/to/temp2"); + + public static void main(String[] args) throws Exception { + useTemp(() -> { + + PhraseSource german = new SimplePhraseSource("path/to/german/data"); + processGerman(german, Paths.get("out/german")); + + }); + } + + private static void useTemp(ThrowsRunnable runnable) throws Exception { + // clear temporary directories before and after pre-processing + Util.delete(temp1, true); + Util.delete(temp2, true); + try { + runnable.runThrowing(); + } finally { + Util.delete(temp1, true); + Util.delete(temp2, true); + } + } + + /** + * + * @throws Exception + */ + static void processGerman(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + operationOptions.getAdditionalMappers() + .add(new ContractionMapper(Util.readResourceWordList("eng/contractions.txt"))); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path 
output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.German german = new HyphenationJoiner.German(); + german.setStopWordList(Util.readResourceWordList("ger/stop-words.txt")); + + return new HyphenationJoiner(german, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void processEnglish(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.English english = new HyphenationJoiner.English(); + + return new HyphenationJoiner(english, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void toLowerCase(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(inputSource -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Preprocessing.process(inputSource, output, Arrays.asList(PhraseMappers.toLowerCase()), 
options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + +} diff --git a/src/main/resources/blacklist.txt b/src/main/resources/blacklist.txt new file mode 100644 index 0000000..96f9ba1 --- /dev/null +++ b/src/main/resources/blacklist.txt @@ -0,0 +1,67 @@ +' +" +„ +“ +„ +” +« +» +` +´ + +-- +--- +---- ++ += +~ +* +% +# +.. +... +.... +: +| +( +) +[ +] +{ +} +< +> +^ +@ +/ +\ +& + +$ +€ +£ +¥ + +Ã +¼ +¤ + +² +³ + +— +• +■ +¬ +→ +· +_ +… +® +© +█ +™ +♥ +Ȳ +¶ +± diff --git a/src/main/resources/eng/contractions.txt b/src/main/resources/eng/contractions.txt new file mode 100644 index 0000000..4c1baca --- /dev/null +++ b/src/main/resources/eng/contractions.txt @@ -0,0 +1,9 @@ +i'm +(he|she|it)'s +(you|we|they)'re + +(i|you|he|she|it|we|they)'(d|ll|ve) + +y'all + +(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't diff --git a/src/main/resources/ger/stop-words.txt b/src/main/resources/ger/stop-words.txt new file mode 100644 index 0000000..310c68e --- /dev/null +++ b/src/main/resources/ger/stop-words.txt @@ -0,0 +1,68 @@ +, +der +und +die +in +von +zu +den +mit +für +des +das +auf +nicht +im +sich +dem +eine +ein +! +an +auch +als +bei +? +oder +aus +nach +zum +einer +zur +wie +so +nur +über +durch +um +am +einen +aber +noch +mehr +einem +bis +dass +vor +daß +dieser +wenn +diese +vom +hier +unter +dann +was +keine +eines +ab +da +schon +sehr +diesem +sowie + +u. +bzw +bzw. 
+ diff --git a/src/main/resources/super-blacklist.txt b/src/main/resources/super-blacklist.txt new file mode 100755 index 0000000..e69de29 diff --git a/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java new file mode 100644 index 0000000..00905da --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java @@ -0,0 +1,94 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; + +import org.junit.Assert; +import org.junit.Test; + +public class ContractionMapperTest { + + public List getContractionPatterns() { + List patterns = new ArrayList<>(); + + patterns.add("i'm"); + patterns.add("(he|she|it)'s"); + patterns.add("(you|we|they)'re"); + patterns.add("(i|you|he|she|it|we|they)'(d|ll|ve)"); + patterns.add("y'all"); + patterns.add("(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't"); + + return patterns; + } + + @Test + public void contractionTest() { + final ContractionMapper mapper = new ContractionMapper(getContractionPatterns()); + + BiConsumer test = (from, to) -> { + String actual = mapper.map(from, 100); + if (actual == to) + return; + if (to == null || actual == null || !to.contentEquals(actual)) { + Assert.fail("\"" + from + "\" was expected to map to \"" + to + "\" but was actually mapped to \"" + + actual + "\"."); + } + }; + + test.accept("Tom", "Tom"); + test.accept("Tom's bar", "Tom's bar"); + test.accept("Tom 's bar", "Tom's bar"); + test.accept("Tom ' s bar", "Tom's bar"); + test.accept("Tom' s bar", "Tom's bar"); + test.accept("Tom s bar", "Tom s bar"); // too little context, so leave it as is + + test.accept("Charls' phone", "Charls' phone"); + test.accept("Charls ' phone", "Charls' phone"); + test.accept("Charls '", "Charls'"); + test.accept("Charls 't", "Charls 't"); + + test.accept("he's nice", "he's nice"); + test.accept("he' 
s nice", "he's nice"); + test.accept("he ' s nice", "he's nice"); + test.accept("he 's nice", "he's nice"); + test.accept("he s nice", "he's nice"); + + test.accept("we'll do it", "we'll do it"); + test.accept("we 'll do it", "we'll do it"); + test.accept("we ' ll do it", "we'll do it"); + test.accept("we' ll do it", "we'll do it"); + test.accept("we ll do it", "we'll do it"); + test.accept("well do it", "well do it"); // well well well + + test.accept("dont", "don't"); + test.accept("don't", "don't"); + test.accept("don 't", "don't"); + test.accept("don ' t", "don't"); + test.accept("don' t", "don't"); + test.accept("don t", "don't"); + + test.accept("DoNt", "DoN't"); + test.accept("DoN't", "DoN't"); + test.accept("DoN 't", "DoN't"); + test.accept("DoN ' t", "DoN't"); + test.accept("DoN' t", "DoN't"); + test.accept("DoN t", "DoN't"); + + test.accept("I'm", "I'm"); + test.accept("I 'm", "I'm"); + test.accept("I ' m", "I'm"); + test.accept("I' m", "I'm"); + test.accept("I m", "I'm"); + + test.accept("I might", "I might"); + + test.accept("won", "won"); + test.accept("won'", null); + test.accept("won '", null); + test.accept("'t open", null); + test.accept("' t open", null); + test.accept("t open", "t open"); // could be real + } + +} diff --git a/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java new file mode 100644 index 0000000..96c9339 --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java @@ -0,0 +1,137 @@ +package org.netspeak.preprocessing; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.junit.Test; + +public class PhraseMappersTest { + + private void phraseMapperTest(PhraseMapper mapper, Collection unchanged, Collection removed, + Map changed) { + String name = mapper.getName(); + + if (unchanged != null) { 
+ for (String expected : unchanged) { + String actual = mapper.map(expected, 100); + assertEquals("Expected unchanged for " + name, expected, actual); + } + } + + if (removed != null) { + for (String expected : removed) { + String actual = mapper.map(expected, 100); + assertEquals("Expected removed for " + name, null, actual); + } + } + + if (changed != null) { + for (Map.Entry transform : changed.entrySet()) { + String actual = mapper.map(transform.getKey(), 100); + assertEquals("Expected changed for " + name, transform.getValue(), actual); + } + } + } + + @Test + public void blacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". - ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 1); + + final Collection unchanged = new ArrayList<>(sharedUnchanged); + unchanged.add("()"); + + final Collection removed = new ArrayList<>(sharedRemoved); + + phraseMapperTest(mapper, unchanged, removed, null); + } + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 4); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("()()-"); + + final Collection removed = new ArrayList<>(); + removed.add("()()"); + removed.add("-.-."); + removed.add("-.-. foo"); + removed.add("foo -.-. foo"); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + + @Test + public void superBlacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". 
- ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.superBlacklist(blacklistedWords); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("foo bar"); + + final Collection removed = new ArrayList<>(); + removed.add("."); + removed.add("."); + removed.add("("); + removed.add(")"); + removed.add("-"); + removed.add("foo -"); + removed.add("- foo"); + removed.add("- foo -"); + removed.add("foo - bar"); + + removed.add("foo-bar"); + removed.add("Dr."); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + +}