From 7596043cec9c10602432d530ca2897bb197e551b Mon Sep 17 00:00:00 2001 From: RunDevelopment Date: Tue, 5 May 2020 23:48:02 +0200 Subject: [PATCH] Initial commit --- .gitignore | 9 + LICENSE | 21 + README.md | 174 +++++++ artifactory.gradle | 92 ++++ build.gradle | 53 +++ gradle/wrapper/gradle-wrapper.jar | Bin 0 -> 54413 bytes gradle/wrapper/gradle-wrapper.properties | 6 + gradlew | 172 +++++++ gradlew.bat | 84 ++++ src/main/java/org/netspeak/Util.java | 238 ++++++++++ .../org/netspeak/io/GoogleBooksCsvReader.java | 130 ++++++ .../org/netspeak/io/PhraseFrequencyPair.java | 40 ++ .../java/org/netspeak/io/PhraseReader.java | 22 + .../java/org/netspeak/io/PhraseWriter.java | 24 + .../java/org/netspeak/io/SimpleCsvReader.java | 61 +++ .../java/org/netspeak/io/SimpleCsvWriter.java | 31 ++ .../org/netspeak/io/SplitterCsvWriter.java | 85 ++++ .../preprocessing/ContractionMapper.java | 246 ++++++++++ .../preprocessing/HyphenationJoiner.java | 337 ++++++++++++++ .../netspeak/preprocessing/Operations.java | 198 ++++++++ .../netspeak/preprocessing/PhraseMapper.java | 77 ++++ .../netspeak/preprocessing/PhraseMappers.java | 436 ++++++++++++++++++ .../netspeak/preprocessing/PhraseSource.java | 87 ++++ .../org/netspeak/preprocessing/Pipeline.java | 28 ++ .../netspeak/preprocessing/PipelineItem.java | 8 + .../netspeak/preprocessing/Preprocessing.java | 371 +++++++++++++++ .../preprocessing/PreprocessingOptions.java | 87 ++++ .../preprocessing/SimplePhraseSource.java | 199 ++++++++ .../preprocessing/VocabularyExtractor.java | 99 ++++ .../netspeak/usage/NetspeakBuilderUsage.java | 46 ++ .../org/netspeak/usage/NetspeakTerminal.java | 61 +++ .../org/netspeak/usage/NetspeakUsage.java | 112 +++++ .../netspeak/usage/PreprocessingUsage.java | 152 ++++++ src/main/resources/blacklist.txt | 67 +++ src/main/resources/eng/contractions.txt | 9 + src/main/resources/ger/stop-words.txt | 68 +++ src/main/resources/super-blacklist.txt | 0 .../preprocessing/ContractionMapperTest.java | 94 ++++ 
.../preprocessing/PhraseMappersTest.java | 137 ++++++ 39 files changed, 4161 insertions(+) create mode 100755 .gitignore create mode 100755 LICENSE create mode 100755 README.md create mode 100755 artifactory.gradle create mode 100755 build.gradle create mode 100755 gradle/wrapper/gradle-wrapper.jar create mode 100755 gradle/wrapper/gradle-wrapper.properties create mode 100755 gradlew create mode 100755 gradlew.bat create mode 100755 src/main/java/org/netspeak/Util.java create mode 100644 src/main/java/org/netspeak/io/GoogleBooksCsvReader.java create mode 100644 src/main/java/org/netspeak/io/PhraseFrequencyPair.java create mode 100644 src/main/java/org/netspeak/io/PhraseReader.java create mode 100644 src/main/java/org/netspeak/io/PhraseWriter.java create mode 100644 src/main/java/org/netspeak/io/SimpleCsvReader.java create mode 100755 src/main/java/org/netspeak/io/SimpleCsvWriter.java create mode 100644 src/main/java/org/netspeak/io/SplitterCsvWriter.java create mode 100755 src/main/java/org/netspeak/preprocessing/ContractionMapper.java create mode 100755 src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java create mode 100644 src/main/java/org/netspeak/preprocessing/Operations.java create mode 100755 src/main/java/org/netspeak/preprocessing/PhraseMapper.java create mode 100755 src/main/java/org/netspeak/preprocessing/PhraseMappers.java create mode 100644 src/main/java/org/netspeak/preprocessing/PhraseSource.java create mode 100644 src/main/java/org/netspeak/preprocessing/Pipeline.java create mode 100644 src/main/java/org/netspeak/preprocessing/PipelineItem.java create mode 100755 src/main/java/org/netspeak/preprocessing/Preprocessing.java create mode 100644 src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java create mode 100644 src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java create mode 100644 src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java create mode 100644 
src/main/java/org/netspeak/usage/NetspeakBuilderUsage.java create mode 100755 src/main/java/org/netspeak/usage/NetspeakTerminal.java create mode 100755 src/main/java/org/netspeak/usage/NetspeakUsage.java create mode 100644 src/main/java/org/netspeak/usage/PreprocessingUsage.java create mode 100644 src/main/resources/blacklist.txt create mode 100644 src/main/resources/eng/contractions.txt create mode 100644 src/main/resources/ger/stop-words.txt create mode 100755 src/main/resources/super-blacklist.txt create mode 100644 src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java create mode 100644 src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java diff --git a/.gitignore b/.gitignore new file mode 100755 index 0000000..dbfe783 --- /dev/null +++ b/.gitignore @@ -0,0 +1,9 @@ +bin +build +.gradle +.ideaout/ +.idea/ +.settings/ +*.iml +.project +.classpath diff --git a/LICENSE b/LICENSE new file mode 100755 index 0000000..5eeca62 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT LICENSE + +Copyright (c) 2019 Webis group + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/README.md b/README.md new file mode 100755 index 0000000..2e466b0 --- /dev/null +++ b/README.md @@ -0,0 +1,174 @@ +# Netspeak 4 indexing + +This project contains all necessities to create a new Netspeak 4 index. + +This project is mainly intended for developers that want to build a new Netspeak 4 index from a given data set. + + +--- + +## Contributors + +Michael Schmidt (2018 - 2020) + +Martin Trenkmann (2008 - 2013) + +Martin Potthast (2008 - 2020) + +Benno Stein (2008 - 2020) + + + +--- + +# Old Notes + +% NETSPEAK 4 JAVA NOTES +% martin.trenkmann@uni-weimar.de +% November 22, 2013 + + + +Notation +-------- + + # Does need admin permissions (sudo). + $ Does not need admin permissions. + + +Project description +------------------- + + + + +Library dependencies +-------------------- + +This Java project is a language binding for the C++ project netspeak4-application-cpp whose +implementation comes in the form of a shared library (.so file). The present Java +application loads the library at runtime and invokes its native routines via +the Java Native Interface (JNI) method. Precompiled libraries for Ubuntu 10.04 +and 12.04 can be found in the lib sub-directory of this project. The native +library itself has some dependencies you need to install as well. To do so run +the following script: + + # /build/install-dependencies.sh + + +Build and install the native library +------------------------------------ + +In the case that there is no precompiled native library available for your +platform, you need to compile the corresponding C++ project by yourself. + +- Checkout netspeak4-application-cpp from webis CVS. +- Build target "Library" with Qt Creator IDE. 
+ +# cp /lib//.so /usr/lib + + +Load native library +------------------- + +Set "-Djava.library.path=/usr/lib" as VM argument. + + +Build Netspeak from n-gram collection +------------------------------------- + +To build Netspeak from a collection of n-grams you have to provide a dedicated +directory with one or more text files as input. Each of these files have to +list a number of n-grams together with their frequencies, one by line. The +format of a single line is defined as follows: + + word_1 SPACE word_2 SPACE ... word_n TAB frequency + +In words: Each line defines an n-gram with its frequency. The delimiter between +the n-gram and the frequency is a single tabulator ('\t'). The delimiter to +separate the n-gram's words is a single whitespace (' '). + +Note: Follow this specification strictly to prevent parsing errors. In +particular, ensure the single `\t` delimiter between n-gram and frequency. + + +Getting Started +--------------- + +- `usage.NetspeakBuilderUsage.java` shows how to build Netspeak from a + collection of n-grams. +- `usage.NetspeakTerminal.java` runs a simple command line to search a Netspeak + instance interactively for testing purposes. +- `usage.NetspeakUsage.java` demonstrates how to search Netspeak in more detail + using the Request and Response objects. + +In some cases, if your local hardware, storage space or operating system +(Netspeak runs only on Linux) does not fit, it might be necessary to setup +Netspeak running on a Linux server and to request that instance remotely. + +For that reason build your Netspeak application as usual and run it as a Java +servlet, e.g. with Tomcat, using the project `netspeak4-server`. A running +Netspeak server can then be requested with `netspeak3-client-java` project from +any Java application. + + +Netspeak query language +----------------------- + +The Netspeak query syntax as described here should be used as reference. There +might be other syntax information out there, e.g. 
at netspeak.org, which +provides some syntactical simplifications in form of easier to use wildcards or +operators. However, these modified syntaxes are just front-ends and do not work +with the original Netspeak interface. Here is the truth: + + ? is a placeholder for exactly one word and can be sequenced to search for + exactly two, three, four ... words. + + Example: how to ? this + -> how to use this + -> how to do this + -> how to cite this + + * is a placeholder for zero or many words. + + Example: see * works + -> see how it works + -> see if it works + -> see what works + + [] compares options, i.e. it checks each word or phrase between these + brackets plus the so called empty word at that position in the query. + + Example: it's [ great well "so good" ] + -> it's + -> it's great + -> it's well + -> it's so good + + {} checks the order, i.e. it tries to find each permutation of the given + sequence of words or phrases at that position in the query. + + Example: for { "very important people" only } + -> for very important people only + -> for only very important people + + # searches for alternatives of the word following. This operator requests + the optional Netspeak hash-dictionary component and uses [] to compare + each retrieved alternative (except that the empty word is not checked). + The mapping from word to alternatives is completely up to the user when + building Netspeak, for netspeak.org we use this operator for a synonym + search providing the Wordnet dictionary. + + Example: waiting for #response + -> waiting for response + -> waiting for answer + -> waiting for reply + +You can combine the introduced wildcards and operators as you want, but with the +exception that you may not place any wildcard within bracket operators. Also +nested brackets are not allowed. As you can see in the examples above you can +quote phrases to be handled as one entity in `[]` and `{}`. 
+ + + +% Compile via: pandoc from.txt > to.html diff --git a/artifactory.gradle b/artifactory.gradle new file mode 100755 index 0000000..ff96511 --- /dev/null +++ b/artifactory.gradle @@ -0,0 +1,92 @@ +// Fetch Artifactory publishing plugin +buildscript { + repositories { + jcenter() + } + dependencies { + classpath 'org.jfrog.buildinfo:build-info-extractor-gradle:4+' + } +} + +// Apply plugins +apply plugin: 'maven-publish' +apply plugin: org.jfrog.gradle.plugin.artifactory.ArtifactoryPlugin + +// Determine which repositories to pull from and publish to +def pullRelease = 'libs-release' +def pullSnapshot = 'libs-snapshot' +def pushRelease = 'libs-snapshot-webis-gradle' +def pushSnapshot = 'libs-release-webis-gradle' + +if (project.ext.has("nonFree") && project.ext.get("nonFree")) { + pullRelease += '-nonfree' + pullSnapshot += '-nonfree' + pushRelease += '-nonfree' + pushSnapshot += '-nonfree' +} + +repositories { + maven { + url = 'https://repo.webis.de/artifactory/' + pullRelease + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } + maven { + url = 'https://repo.webis.de/artifactory/' + pullSnapshot + credentials { + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + } + } +} + +// Configure Artifactory remote +artifactory { + contextUrl = "https://repo.webis.de/artifactory" + publish { + repository { + repoKey = version.endsWith('SNAPSHOT') ? 
pushRelease : pushSnapshot + username = project.findProperty("artifactoryUsername") ?: "" + password = project.findProperty("artifactoryPassword") ?: "" + maven = true + } + defaults { + publications('mavenJava') + } + } +} + +// Create tasks for generating source and JavaDoc JARs +task sourcesJar(type: Jar, dependsOn: classes) { + classifier = 'sources' + from sourceSets.main.allSource +} + +task javadocJar(type: Jar, dependsOn: javadoc) { + classifier = 'javadoc' + from javadoc.destinationDir +} + +artifacts { + archives javadocJar + archives sourcesJar +} + +// Configure Maven Publishing Information +publishing { + publications { + mavenJava(MavenPublication) { + // Publish binary, source, and JavaDoc JARs + from components.java + artifact sourcesJar + artifact javadocJar + + // Set POM definition + if (project.ext.has("pomDef")) { + pom project.ext.get("pomDef") + } + } + } +} diff --git a/build.gradle b/build.gradle new file mode 100755 index 0000000..339432d --- /dev/null +++ b/build.gradle @@ -0,0 +1,53 @@ +// Apply plugins +apply plugin: 'java' +apply plugin: 'jacoco' +apply plugin: 'application' + +// Basic configuration and settings for all (sub-)projects +allprojects { + group = 'org.netspeak' + version = '1.0' + mainClassName = 'org.netspeak.usage.NetspeakTerminal' + sourceCompatibility = 1.8 + targetCompatibility = 1.8 + + // Set source file encoding + compileJava.options.encoding = "UTF-8" + compileTestJava.options.encoding = "UTF-8" + javadoc.options.encoding = 'UTF-8' + + // Declare global dependencies + dependencies { + compile group: 'org.netspeak', name: 'netspeak4-application-java', version: '1.0' + compile group: 'org.apache.commons', name: 'commons-compress', version: '1.19' + + testImplementation 'junit:junit:4.12' + } + + // Set MANIFEST.MF contents + jar { + manifest { + attributes('Main-Class': mainClassName) + } + } +} + +// Set POM definition +project.ext.pomDef = { + name = 'Netspeak 4 stuff' + description = 'An application with lots of 
miscellaneous functionality related to Netspeak 4' + url = 'http://netspeak.org' + //licenses { + // license { + // name = 'The Apache License, Version 2.0' + // url = 'http://www.apache.org/licenses/LICENSE-2.0.txt' + // } + //} + organization { + name = 'Netspeak' + url = 'http://netspeak.org' + } +} + +// Include Artifactory configuration +apply from: 'artifactory.gradle' diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100755 index 0000000000000000000000000000000000000000..0d4a9516871afd710a9d84d89e31ba77745607bd GIT binary patch literal 54413 zcmafaV|Zr4wq`oEZQHiZj%|LijZQlLf{tz5M#r{o+fI6V=G-$g=gzrzeyqLskF}nv zRZs0&c;EUi2L_G~0s;*U0szbL-0C3_3~ zRZ#mYf6f1oqJoH`jHHCB8l!^by~4z}yc`4LEP@;Z?bO6{g9`Hk+s@(L1jC5Tq{1Yf z4E;CQvrx0-gF+peRxFC*gF=&$zNYjO?K|gN=WqXMz`tYs@0o%B{dRD+{C_6(f9t^g zhmNJQv6-#;f2)f2uc{u-#*U8W&i{|ewYN^n_1~cv|1J!}zc&$eaBy{T{cEpa46s*q zHFkD2cV;xTHFj}{*3kBt*FgS4A5SI|$F%$gB@It9FlC}D3y`sbZG{2P6gGwC$U`6O zb_cId9AhQl#A<&=x>-xDD%=Ppt$;y71@Lwsl{x943#T@8*?cbR<~d`@@}4V${+r$jICUIOzgZJy_9I zu*eA(F)$~J07zX%tmQN}1^wj+RM|9bbwhQA=xrPE*{vB_P!pPYT5{Or^m*;Qz#@Bl zRywCG_RDyM6bf~=xn}FtiFAw|rrUxa1+z^H`j6e|GwKDuq}P)z&@J>MEhsVBvnF|O zOEm)dADU1wi8~mX(j_8`DwMT_OUAnjbWYer;P*^Uku_qMu3}qJU zTAkza-K9aj&wcsGuhQ>RQoD?gz~L8RwCHOZDzhBD$az*$TQ3!uygnx_rsXG`#_x5t zn*lb(%JI3%G^MpYp-Y(KI4@_!&kBRa3q z|Fzn&3R%ZsoMNEn4pN3-BSw2S_{IB8RzRv(eQ1X zyBQZHJ<(~PfUZ~EoI!Aj`9k<+Cy z2DtI<+9sXQu!6&-Sk4SW3oz}?Q~mFvy(urUy<)x!KQ>#7yIPC)(ORhKl7k)4eSy~} z7#H3KG<|lt68$tk^`=yjev%^usOfpQ#+Tqyx|b#dVA(>fPlGuS@9ydo z!Cs#hse9nUETfGX-7lg;F>9)+ml@M8OO^q|W~NiysX2N|2dH>qj%NM`=*d3GvES_# zyLEHw&1Fx<-dYxCQbk_wk^CI?W44%Q9!!9aJKZW-bGVhK?N;q`+Cgc*WqyXcxZ%U5QXKu!Xn)u_dxeQ z;uw9Vysk!3OFzUmVoe)qt3ifPin0h25TU zrG*03L~0|aaBg7^YPEW^Yq3>mSNQgk-o^CEH?wXZ^QiPiuH}jGk;75PUMNquJjm$3 zLcXN*uDRf$Jukqg3;046b;3s8zkxa_6yAlG{+7{81O3w96i_A$KcJhD&+oz1<>?lun#C3+X0q zO4JxN{qZ!e#FCl@e_3G?0I^$CX6e$cy7$BL#4<`AA)Lw+k`^15pmb-447~5lkSMZ` z>Ce|adKhb-F%yy!vx>yQbXFgHyl(an=x^zi(!-~|k;G1=E(e@JgqbAF{;nv`3i)oi 
zDeT*Q+Mp{+NkURoabYb9@#Bi5FMQnBFEU?H{~9c;g3K%m{+^hNe}(MdpPb?j9`?2l z#%AO!|2QxGq7-2Jn2|%atvGb(+?j&lmP509i5y87`9*BSY++<%%DXb)kaqG0(4Eft zj|2!Od~2TfVTi^0dazAIeVe&b#{J4DjN6;4W;M{yWj7#+oLhJyqeRaO;>?%mX>Ec{Mp~;`bo}p;`)@5dA8fNQ38FyMf;wUPOdZS{U*8SN6xa z-kq3>*Zos!2`FMA7qjhw-`^3ci%c91Lh`;h{qX1r;x1}eW2hYaE*3lTk4GwenoxQ1kHt1Lw!*N8Z%DdZSGg5~Bw}+L!1#d$u+S=Bzo7gi zqGsBV29i)Jw(vix>De)H&PC; z-t2OX_ak#~eSJ?Xq=q9A#0oaP*dO7*MqV;dJv|aUG00UX=cIhdaet|YEIhv6AUuyM zH1h7fK9-AV)k8sr#POIhl+?Z^r?wI^GE)ZI=H!WR<|UI(3_YUaD#TYV$Fxd015^mT zpy&#-IK>ahfBlJm-J(n(A%cKV;)8&Y{P!E|AHPtRHk=XqvYUX?+9po4B$0-6t74UUef${01V{QLEE8gzw* z5nFnvJ|T4dlRiW9;Ed_yB{R@)fC=zo4hCtD?TPW*WJmMXYxN_&@YQYg zBQ$XRHa&EE;YJrS{bn7q?}Y&DH*h;){5MmE(9A6aSU|W?{3Ox%5fHLFScv7O-txuRbPG1KQtI`Oay=IcEG=+hPhlnYC;`wSHeo|XGio0aTS6&W($E$ z?N&?TK*l8;Y^-xPl-WVZwrfdiQv10KdsAb9u-*1co*0-Z(h#H)k{Vc5CT!708cs%sExvPC+7-^UY~jTfFq=cj z!Dmy<+NtKp&}}$}rD{l?%MwHdpE(cPCd;-QFPk1`E5EVNY2i6E`;^aBlx4}h*l42z zpY#2cYzC1l6EDrOY*ccb%kP;k8LHE3tP>l3iK?XZ%FI<3666yPw1rM%>eCgnv^JS_ zK7c~;g7yXt9fz@(49}Dj7VO%+P!eEm& z;z8UXs%NsQ%@2S5nve)@;yT^61BpVlc}=+i6{ZZ9r7<({yUYqe==9*Z+HguP3`sA& z{`inI4G)eLieUQ*pH9M@)u7yVnWTQva;|xq&-B<>MoP(|xP(HqeCk1&h>DHNLT>Zi zQ$uH%s6GoPAi0~)sC;`;ngsk+StYL9NFzhFEoT&Hzfma1f|tEnL0 zMWdX4(@Y*?*tM2@H<#^_l}BC&;PYJl%~E#veQ61{wG6!~nyop<^e)scV5#VkGjYc2 z$u)AW-NmMm%T7WschOnQ!Hbbw&?`oMZrJ&%dVlN3VNra1d0TKfbOz{dHfrCmJ2Jj= zS#Gr}JQcVD?S9X!u|oQ7LZ+qcq{$40 ziG5=X^+WqeqxU00YuftU7o;db=K+Tq!y^daCZgQ)O=M} zK>j*<3oxs=Rcr&W2h%w?0Cn3);~vqG>JO_tTOzuom^g&^vzlEjkx>Sv!@NNX%_C!v zaMpB>%yVb}&ND9b*O>?HxQ$5-%@xMGe4XKjWh7X>CYoRI2^JIwi&3Q5UM)?G^k8;8 zmY$u;(KjZx>vb3fe2zgD7V;T2_|1KZQW$Yq%y5Ioxmna9#xktcgVitv7Sb3SlLd6D zfmBM9Vs4rt1s0M}c_&%iP5O{Dnyp|g1(cLYz^qLqTfN6`+o}59Zlu%~oR3Q3?{Bnr zkx+wTpeag^G12fb_%SghFcl|p2~<)Av?Agumf@v7y-)ecVs`US=q~=QG%(_RTsqQi z%B&JdbOBOmoywgDW|DKR5>l$1^FPhxsBrja<&}*pfvE|5dQ7j-wV|ur%QUCRCzBR3q*X`05O3U@?#$<>@e+Zh&Z&`KfuM!0XL& zI$gc@ZpM4o>d&5)mg7+-Mmp98K^b*28(|Ew8kW}XEV7k^vnX-$onm9OtaO@NU9a|as7iA%5Wrw9*%UtJYacltplA5}gx^YQM` 
zVkn`TIw~avq)mIQO0F0xg)w$c)=8~6Jl|gdqnO6<5XD)&e7z7ypd3HOIR+ss0ikSVrWar?548HFQ*+hC)NPCq*;cG#B$7 z!n?{e9`&Nh-y}v=nK&PR>PFdut*q&i81Id`Z<0vXUPEbbJ|<~_D!)DJMqSF~ly$tN zygoa)um~xdYT<7%%m!K8+V(&%83{758b0}`b&=`))Tuv_)OL6pf=XOdFk&Mfx9y{! z6nL>V?t=#eFfM$GgGT8DgbGRCF@0ZcWaNs_#yl+6&sK~(JFwJmN-aHX{#Xkpmg;!} zgNyYYrtZdLzW1tN#QZAh!z5>h|At3m+ryJ-DFl%V>w?cmVTxt^DsCi1ZwPaCe*D{) z?#AZV6Debz{*D#C2>44Czy^yT3y92AYDcIXtZrK{L-XacVl$4i=X2|K=Fy5vAzhk{ zu3qG=qSb_YYh^HirWf~n!_Hn;TwV8FU9H8+=BO)XVFV`nt)b>5yACVr!b98QlLOBDY=^KS<*m9@_h3;64VhBQzb_QI)gbM zSDto2i*iFrvxSmAIrePB3i`Ib>LdM8wXq8(R{-)P6DjUi{2;?}9S7l7bND4w%L2!; zUh~sJ(?Yp}o!q6)2CwG*mgUUWlZ;xJZo`U`tiqa)H4j>QVC_dE7ha0)nP5mWGB268 zn~MVG<#fP#R%F=Ic@(&Va4dMk$ysM$^Avr1&hS!p=-7F>UMzd(M^N9Ijb|364}qcj zcIIh7suk$fQE3?Z^W4XKIPh~|+3(@{8*dSo&+Kr(J4^VtC{z*_{2}ld<`+mDE2)S| zQ}G#Q0@ffZCw!%ZGc@kNoMIdQ?1db%N1O0{IPPesUHI;(h8I}ETudk5ESK#boZgln z(0kvE`&6z1xH!s&={%wQe;{^&5e@N0s7IqR?L*x%iXM_czI5R1aU?!bA7)#c4UN2u zc_LZU+@elD5iZ=4*X&8%7~mA;SA$SJ-8q^tL6y)d150iM)!-ry@TI<=cnS#$kJAS# zq%eK**T*Wi2OlJ#w+d_}4=VN^A%1O+{?`BK00wkm)g8;u?vM;RR+F1G?}({ENT3i= zQsjJkp-dmJ&3-jMNo)wrz0!g*1z!V7D(StmL(A}gr^H-CZ~G9u?*Uhcx|x7rb`v^X z9~QGx;wdF4VcxCmEBp$F#sms@MR?CF67)rlpMxvwhEZLgp2?wQq|ci#rLtrYRV~iR zN?UrkDDTu114&d~Utjcyh#tXE_1x%!dY?G>qb81pWWH)Ku@Kxbnq0=zL#x@sCB(gs zm}COI(!{6-XO5li0>1n}Wz?w7AT-Sp+=NQ1aV@fM$`PGZjs*L+H^EW&s!XafStI!S zzgdntht=*p#R*o8-ZiSb5zf6z?TZr$^BtmIfGAGK;cdg=EyEG)fc*E<*T=#a?l=R5 zv#J;6C(umoSfc)W*EODW4z6czg3tXIm?x8{+8i^b;$|w~k)KLhJQnNW7kWXcR^sol z1GYOp?)a+}9Dg*nJ4fy*_riThdkbHO37^csfZRGN;CvQOtRacu6uoh^gg%_oEZKDd z?X_k67s$`|Q&huidfEonytrq!wOg07H&z@`&BU6D114p!rtT2|iukF}>k?71-3Hk< zs6yvmsMRO%KBQ44X4_FEYW~$yx@Y9tKrQ|rC1%W$6w}-9!2%4Zk%NycTzCB=nb)r6*92_Dg+c0;a%l1 zsJ$X)iyYR2iSh|%pIzYV1OUWER&np{w1+RXb~ zMUMRymjAw*{M)UtbT)T!kq5ZAn%n=gq3ssk3mYViE^$paZ;c^7{vXDJ`)q<}QKd2?{r9`X3mpZ{AW^UaRe2^wWxIZ$tuyKzp#!X-hXkHwfD zj@2tA--vFi3o_6B?|I%uwD~emwn0a z+?2Lc1xs(`H{Xu>IHXpz=@-84uw%dNV;{|c&ub|nFz(=W-t4|MME(dE4tZQi?0CE|4_?O_dyZj1)r zBcqB8I^Lt*#)ABdw#yq{OtNgf240Jvjm8^zdSf40 
z;H)cp*rj>WhGSy|RC5A@mwnmQ`y4{O*SJ&S@UFbvLWyPdh)QnM=(+m3p;0&$^ysbZ zJt!ZkNQ%3hOY*sF2_~-*`aP|3Jq7_<18PX*MEUH*)t{eIx%#ibC|d&^L5FwoBN}Oe z?!)9RS@Zz%X1mqpHgym75{_BM4g)k1!L{$r4(2kL<#Oh$Ei7koqoccI3(MN1+6cDJ zp=xQhmilz1?+ZjkX%kfn4{_6K_D{wb~rdbkh!!k!Z@cE z^&jz55*QtsuNSlGPrU=R?}{*_8?4L7(+?>?(^3Ss)f!ou&{6<9QgH>#2$?-HfmDPN z6oIJ$lRbDZb)h-fFEm^1-v?Slb8udG{7GhbaGD_JJ8a9f{6{TqQN;m@$&)t81k77A z?{{)61za|e2GEq2)-OqcEjP`fhIlUs_Es-dfgX-3{S08g`w=wGj2{?`k^GD8d$}6Z zBT0T1lNw~fuwjO5BurKM593NGYGWAK%UCYiq{$p^GoYz^Uq0$YQ$j5CBXyog8(p_E znTC+$D`*^PFNc3Ih3b!2Lu|OOH6@46D)bbvaZHy%-9=$cz}V^|VPBpmPB6Ivzlu&c zPq6s7(2c4=1M;xlr}bkSmo9P`DAF>?Y*K%VPsY`cVZ{mN&0I=jagJ?GA!I;R)i&@{ z0Gl^%TLf_N`)`WKs?zlWolWvEM_?{vVyo(!taG$`FH2bqB`(o50pA=W34kl-qI62lt z1~4LG_j%sR2tBFteI{&mOTRVU7AH>>-4ZCD_p6;-J<=qrod`YFBwJz(Siu(`S}&}1 z6&OVJS@(O!=HKr-Xyzuhi;swJYK*ums~y1ePdX#~*04=b9)UqHHg;*XJOxnS6XK#j zG|O$>^2eW2ZVczP8#$C`EpcWwPFX4^}$omn{;P(fL z>J~%-r5}*D3$Kii z34r@JmMW2XEa~UV{bYP=F;Y5=9miJ+Jw6tjkR+cUD5+5TuKI`mSnEaYE2=usXNBs9 zac}V13%|q&Yg6**?H9D620qj62dM+&&1&a{NjF}JqmIP1I1RGppZ|oIfR}l1>itC% zl>ed${{_}8^}m2^br*AIX$L!Vc?Sm@H^=|LnpJg`a7EC+B;)j#9#tx-o0_e4!F5-4 zF4gA;#>*qrpow9W%tBzQ89U6hZ9g=-$gQpCh6Nv_I0X7t=th2ajJ8dBbh{i)Ok4{I z`Gacpl?N$LjC$tp&}7Sm(?A;;Nb0>rAWPN~@3sZ~0_j5bR+dz;Qs|R|k%LdreS3Nn zp*36^t#&ASm=jT)PIjNqaSe4mTjAzlAFr*@nQ~F+Xdh$VjHWZMKaI+s#FF#zjx)BJ zufxkW_JQcPcHa9PviuAu$lhwPR{R{7CzMUi49=MaOA%ElpK;A)6Sgsl7lw)D$8FwE zi(O6g;m*86kcJQ{KIT-Rv&cbv_SY4 zpm1|lSL*o_1LGOlBK0KuU2?vWcEcQ6f4;&K=&?|f`~X+s8H)se?|~2HcJo{M?Ity) zE9U!EKGz2^NgB6Ud;?GcV*1xC^1RYIp&0fr;DrqWLi_Kts()-#&3|wz{wFQsKfnnsC||T?oIgUp z{O(?Df7&vW!i#_~*@naguLLjDAz+)~*_xV2iz2?(N|0y8DMneikrT*dG`mu6vdK`% z=&nX5{F-V!Reau}+w_V3)4?}h@A@O)6GCY7eXC{p-5~p8x{cH=hNR;Sb{*XloSZ_%0ZKYG=w<|!vy?spR4!6mF!sXMUB5S9o_lh^g0!=2m55hGR; z-&*BZ*&;YSo474=SAM!WzrvjmNtq17L`kxbrZ8RN419e=5CiQ-bP1j-C#@@-&5*(8 zRQdU~+e(teUf}I3tu%PB1@Tr{r=?@0KOi3+Dy8}+y#bvgeY(FdN!!`Kb>-nM;7u=6 z;0yBwOJ6OdWn0gnuM{0`*fd=C(f8ASnH5aNYJjpbY1apTAY$-%)uDi$%2)lpH=#)=HH 
z<9JaYwPKil@QbfGOWvJ?cN6RPBr`f+jBC|-dO|W@x_Vv~)bmY(U(!cs6cnhe0z31O z>yTtL4@KJ*ac85u9|=LFST22~!lb>n7IeHs)_(P_gU}|8G>{D_fJX)8BJ;Se? z67QTTlTzZykb^4!{xF!=C}VeFd@n!9E)JAK4|vWVwWop5vSWcD<;2!88v-lS&ve7C zuYRH^85#hGKX(Mrk};f$j_V&`Nb}MZy1mmfz(e`nnI4Vpq(R}26pZx?fq%^|(n~>* z5a5OFtFJJfrZmgjyHbj1`9||Yp?~`p2?4NCwu_!!*4w8K`&G7U_|np&g7oY*-i;sI zu)~kYH;FddS{7Ri#Z5)U&X3h1$Mj{{yk1Q6bh4!7!)r&rqO6K~{afz@bis?*a56i& zxi#(Ss6tkU5hDQJ0{4sKfM*ah0f$>WvuRL zunQ-eOqa3&(rv4kiQ(N4`FO6w+nko_HggKFWx@5aYr}<~8wuEbD(Icvyl~9QL^MBt zSvD)*C#{2}!Z55k1ukV$kcJLtW2d~%z$t0qMe(%2qG`iF9K_Gsae7OO%Tf8E>ooch ztAw01`WVv6?*14e1w%Wovtj7jz_)4bGAqqo zvTD|B4)Ls8x7-yr6%tYp)A7|A)x{WcI&|&DTQR&2ir(KGR7~_RhNOft)wS<+vQ*|sf;d>s zEfl&B^*ZJp$|N`w**cXOza8(ARhJT{O3np#OlfxP9Nnle4Sto)Fv{w6ifKIN^f1qO*m8+MOgA1^Du!=(@MAh8)@wU8t=Ymh!iuT_lzfm za~xEazL-0xwy9$48!+?^lBwMV{!Gx)N>}CDi?Jwax^YX@_bxl*+4itP;DrTswv~n{ zZ0P>@EB({J9ZJ(^|ptn4ks^Z2UI&87d~J_^z0&vD2yb%*H^AE!w= zm&FiH*c%vvm{v&i3S>_hacFH${|(2+q!`X~zn4$aJDAry>=n|{C7le(0a)nyV{kAD zlud4-6X>1@-XZd`3SKKHm*XNn_zCyKHmf*`C_O509$iy$Wj`Sm3y?nWLCDy>MUx1x zl-sz7^{m(&NUk*%_0(G^>wLDnXW90FzNi$Tu6* z<+{ePBD`%IByu977rI^x;gO5M)Tfa-l*A2mU-#IL2?+NXK-?np<&2rlF;5kaGGrx2 zy8Xrz`kHtTVlSSlC=nlV4_oCsbwyVHG4@Adb6RWzd|Otr!LU=% zEjM5sZ#Ib4#jF(l!)8Na%$5VK#tzS>=05GpV?&o* z3goH1co0YR=)98rPJ~PuHvkA59KUi#i(Mq_$rApn1o&n1mUuZfFLjx@3;h`0^|S##QiTP8rD`r8P+#D@gvDJh>amMIl065I)PxT6Hg(lJ?X7*|XF2Le zv36p8dWHCo)f#C&(|@i1RAag->5ch8TY!LJ3(+KBmLxyMA%8*X%_ARR*!$AL66nF= z=D}uH)D)dKGZ5AG)8N-;Il*-QJ&d8u30&$_Q0n1B58S0ykyDAyGa+BZ>FkiOHm1*& zNOVH;#>Hg5p?3f(7#q*dL74;$4!t?a#6cfy#}9H3IFGiCmevir5@zXQj6~)@zYrWZ zRl*e66rjwksx-)Flr|Kzd#Bg>We+a&E{h7bKSae9P~ z(g|zuXmZ zD?R*MlmoZ##+0c|cJ(O{*h(JtRdA#lChYhfsx25(Z`@AK?Q-S8_PQqk z>|Z@Ki1=wL1_c6giS%E4YVYD|Y-{^ZzFwB*yN8-4#+TxeQ`jhks7|SBu7X|g=!_XL z`mY=0^chZfXm%2DYHJ4z#soO7=NONxn^K3WX={dV>$CTWSZe@<81-8DVtJEw#Uhd3 zxZx+($6%4a&y_rD8a&E`4$pD6-_zZJ%LEE*1|!9uOm!kYXW< zOBXZAowsX-&$5C`xgWkC43GcnY)UQt2Qkib4!!8Mh-Q!_M%5{EC=Gim@_;0+lP%O^ zG~Q$QmatQk{Mu&l{q~#kOD;T-{b1P5u7)o-QPPnqi?7~5?7%IIFKdj{;3~Hu#iS|j 
z)Zoo2wjf%+rRj?vzWz(6JU`=7H}WxLF*|?WE)ci7aK?SCmd}pMW<{#1Z!_7BmVP{w zSrG>?t}yNyCR%ZFP?;}e8_ zRy67~&u11TN4UlopWGj6IokS{vB!v!n~TJYD6k?~XQkpiPMUGLG2j;lh>Eb5bLTkX zx>CZlXdoJsiPx=E48a4Fkla>8dZYB%^;Xkd(BZK$z3J&@({A`aspC6$qnK`BWL;*O z-nRF{XRS`3Y&b+}G&|pE1K-Ll_NpT!%4@7~l=-TtYRW0JJ!s2C-_UsRBQ=v@VQ+4> z*6jF0;R@5XLHO^&PFyaMDvyo?-lAD(@H61l-No#t@at@Le9xOgTFqkc%07KL^&iss z!S2Ghm)u#26D(e1Q7E;L`rxOy-N{kJ zTgfw}az9=9Su?NEMMtpRlYwDxUAUr8F+P=+9pkX4%iA4&&D<|=B|~s*-U+q6cq`y* zIE+;2rD7&D5X;VAv=5rC5&nP$E9Z3HKTqIFCEV%V;b)Y|dY?8ySn|FD?s3IO>VZ&&f)idp_7AGnwVd1Z znBUOBA}~wogNpEWTt^1Rm-(YLftB=SU|#o&pT7vTr`bQo;=ZqJHIj2MP{JuXQPV7% z0k$5Ha6##aGly<}u>d&d{Hkpu?ZQeL_*M%A8IaXq2SQl35yW9zs4^CZheVgHF`%r= zs(Z|N!gU5gj-B^5{*sF>;~fauKVTq-Ml2>t>E0xl9wywD&nVYZfs1F9Lq}(clpNLz z4O(gm_i}!k`wUoKr|H#j#@XOXQ<#eDGJ=eRJjhOUtiKOG;hym-1Hu)1JYj+Kl*To<8( za1Kf4_Y@Cy>eoC59HZ4o&xY@!G(2p^=wTCV>?rQE`Upo^pbhWdM$WP4HFdDy$HiZ~ zRUJFWTII{J$GLVWR?miDjowFk<1#foE3}C2AKTNFku+BhLUuT>?PATB?WVLzEYyu+ zM*x((pGdotzLJ{}R=OD*jUexKi`mb1MaN0Hr(Wk8-Uj0zA;^1w2rmxLI$qq68D>^$ zj@)~T1l@K|~@YJ6+@1vlWl zHg5g%F{@fW5K!u>4LX8W;ua(t6YCCO_oNu}IIvI6>Fo@MilYuwUR?9p)rKNzDmTAN zzN2d>=Za&?Z!rJFV*;mJ&-sBV80%<-HN1;ciLb*Jk^p?u<~T25%7jjFnorfr={+wm zzl5Q6O>tsN8q*?>uSU6#xG}FpAVEQ_++@}G$?;S7owlK~@trhc#C)TeIYj^N(R&a} zypm~c=fIs;M!YQrL}5{xl=tUU-Tfc0ZfhQuA-u5(*w5RXg!2kChQRd$Fa8xQ0CQIU zC`cZ*!!|O!*y1k1J^m8IIi|Sl3R}gm@CC&;4840^9_bb9%&IZTRk#=^H0w%`5pMDCUef5 zYt-KpWp2ijh+FM`!zZ35>+7eLN;s3*P!bp%-oSx34fdTZ14Tsf2v7ZrP+mitUx$rS zW(sOi^CFxe$g3$x45snQwPV5wpf}>5OB?}&Gh<~i(mU&ss#7;utaLZ!|KaTHniGO9 zVC9OTzuMKz)afey_{93x5S*Hfp$+r*W>O^$2ng|ik!<`U1pkxm3*)PH*d#>7md1y} zs7u^a8zW8bvl92iN;*hfOc-=P7{lJeJ|3=NfX{(XRXr;*W3j845SKG&%N zuBqCtDWj*>KooINK1 zFPCsCWr!-8G}G)X*QM~34R*k zmRmDGF*QE?jCeNfc?k{w<}@29e}W|qKJ1K|AX!htt2|B`nL=HkC4?1bEaHtGBg}V( zl(A`6z*tck_F$4;kz-TNF%7?=20iqQo&ohf@S{_!TTXnVh}FaW2jxAh(DI0f*SDG- z7tqf5X@p#l?7pUNI(BGi>n_phw=lDm>2OgHx-{`T>KP2YH9Gm5ma zb{>7>`tZ>0d5K$j|s2!{^sFWQo3+xDb~#=9-jp(1ydI3_&RXGB~rxWSMgDCGQG)oNoc#>)td 
zqE|X->35U?_M6{^lB4l(HSN|`TC2U*-`1jSQeiXPtvVXdN-?i1?d#;pw%RfQuKJ|e zjg75M+Q4F0p@8I3ECpBhGs^kK;^0;7O@MV=sX^EJLVJf>L;GmO z3}EbTcoom7QbI(N8ad!z(!6$!MzKaajSRb0c+ZDQ($kFT&&?GvXmu7+V3^_(VJx1z zP-1kW_AB&_A;cxm*g`$ z#Pl@Cg{siF0ST2-w)zJkzi@X)5i@)Z;7M5ewX+xcY36IaE0#flASPY2WmF8St0am{ zV|P|j9wqcMi%r-TaU>(l*=HxnrN?&qAyzimA@wtf;#^%{$G7i4nXu=Pp2#r@O~wi)zB>@25A*|axl zEclXBlXx1LP3x0yrSx@s-kVW4qlF+idF+{M7RG54CgA&soDU-3SfHW@-6_ z+*;{n_SixmGCeZjHmEE!IF}!#aswth_{zm5Qhj0z-@I}pR?cu=P)HJUBClC;U+9;$#@xia30o$% zDw%BgOl>%vRenxL#|M$s^9X}diJ9q7wI1-0n2#6>@q}rK@ng(4M68(t52H_Jc{f&M9NPxRr->vj-88hoI?pvpn}llcv_r0`;uN>wuE{ z&TOx_i4==o;)>V4vCqG)A!mW>dI^Ql8BmhOy$6^>OaUAnI3>mN!Zr#qo4A>BegYj` zNG_)2Nvy2Cqxs1SF9A5HHhL7sai#Umw%K@+riaF+q)7&MUJvA&;$`(w)+B@c6!kX@ zzuY;LGu6|Q2eu^06PzSLspV2v4E?IPf`?Su_g8CX!75l)PCvyWKi4YRoRThB!-BhG zubQ#<7oCvj@z`^y&mPhSlbMf0<;0D z?5&!I?nV-jh-j1g~&R(YL@c=KB_gNup$8abPzXZN`N|WLqxlN)ZJ+#k4UWq#WqvVD z^|j+8f5uxTJtgcUscKTqKcr?5g-Ih3nmbvWvvEk})u-O}h$=-p4WE^qq7Z|rLas0$ zh0j&lhm@Rk(6ZF0_6^>Rd?Ni-#u1y`;$9tS;~!ph8T7fLlYE{P=XtWfV0Ql z#z{_;A%p|8+LhbZT0D_1!b}}MBx9`R9uM|+*`4l3^O(>Mk%@ha>VDY=nZMMb2TnJ= zGlQ+#+pmE98zuFxwAQcVkH1M887y;Bz&EJ7chIQQe!pgWX>(2ruI(emhz@_6t@k8Z zqFEyJFX2PO`$gJ6p$=ku{7!vR#u+$qo|1r;orjtp9FP^o2`2_vV;W&OT)acRXLN^m zY8a;geAxg!nbVu|uS8>@Gvf@JoL&GP`2v4s$Y^5vE32&l;2)`S%e#AnFI-YY7_>d#IKJI!oL6e z_7W3e=-0iz{bmuB*HP+D{Nb;rn+RyimTFqNV9Bzpa0?l`pWmR0yQOu&9c0S*1EPr1 zdoHMYlr>BycjTm%WeVuFd|QF8I{NPT&`fm=dITj&3(M^q ze2J{_2zB;wDME%}SzVWSW6)>1QtiX)Iiy^p2eT}Ii$E9w$5m)kv(3wSCNWq=#DaKZ zs%P`#^b7F-J0DgQ1?~2M`5ClYtYN{AlU|v4pEg4z03=g6nqH`JjQuM{k`!6jaIL_F zC;sn?1x?~uMo_DFg#ypNeie{3udcm~M&bYJ1LI zE%y}P9oCX3I1Y9yhF(y9Ix_=8L(p)EYr&|XZWCOb$7f2qX|A4aJ9bl7pt40Xr zXUT#NMBB8I@xoIGSHAZkYdCj>eEd#>a;W-?v4k%CwBaR5N>e3IFLRbDQTH#m_H+4b zk2UHVymC`%IqwtHUmpS1!1p-uQB`CW1Y!+VD!N4TT}D8(V0IOL|&R&)Rwj@n8g@=`h&z9YTPDT+R9agnwPuM!JW~=_ya~% zIJ*>$Fl;y7_`B7G4*P!kcy=MnNmR`(WS5_sRsvHF42NJ;EaDram5HwQ4Aw*qbYn0j;#)bh1lyKLg#dYjN*BMlh+fxmCL~?zB;HBWho;20WA==ci0mAqMfyG>1!HW 
zO7rOga-I9bvut1Ke_1eFo9tbzsoPTXDW1Si4}w3fq^Z|5LGf&egnw%DV=b11$F=P~ z(aV+j8S}m=CkI*8=RcrT>GmuYifP%hCoKY22Z4 zmu}o08h3YhcXx-v-QC??8mDn<+}+*X{+gZH-I;G^|7=1fBveS?J$27H&wV5^V^P$! z84?{UeYSmZ3M!@>UFoIN?GJT@IroYr;X@H~ax*CQ>b5|Xi9FXt5j`AwUPBq`0sWEJ z3O|k+g^JKMl}L(wfCqyMdRj9yS8ncE7nI14Tv#&(?}Q7oZpti{Q{Hw&5rN-&i|=fWH`XTQSu~1jx(hqm$Ibv zRzFW9$xf@oZAxL~wpj<0ZJ3rdPAE=0B>G+495QJ7D>=A&v^zXC9)2$$EnxQJ<^WlV zYKCHb1ZzzB!mBEW2WE|QG@&k?VXarY?umPPQ|kziS4{EqlIxqYHP!HN!ncw6BKQzKjqk!M&IiOJ9M^wc~ZQ1xoaI z;4je%ern~?qi&J?eD!vTl__*kd*nFF0n6mGEwI7%dI9rzCe~8vU1=nE&n4d&8}pdL zaz`QAY?6K@{s2x%Sx%#(y+t6qLw==>2(gb>AksEebXv=@ht>NBpqw=mkJR(c?l7vo z&cV)hxNoYPGqUh9KAKT)kc(NqekzE6(wjjotP(ac?`DJF=Sb7^Xet-A3PRl%n&zKk zruT9cS~vV1{%p>OVm1-miuKr<@rotj*5gd$?K`oteNibI&K?D63RoBjw)SommJ5<4 zus$!C8aCP{JHiFn2>XpX&l&jI7E7DcTjzuLYvON2{rz<)#$HNu(;ie-5$G<%eLKnTK7QXfn(UR(n+vX%aeS6!q6kv z!3nzY76-pdJp339zsl_%EI|;ic_m56({wdc(0C5LvLULW=&tWc5PW-4;&n+hm1m`f zzQV0T>OPSTjw=Ox&UF^y< zarsYKY8}YZF+~k70=olu$b$zdLaozBE|QE@H{_R21QlD5BilYBTOyv$D5DQZ8b1r- zIpSKX!SbA0Pb5#cT)L5!KpxX+x+8DRy&`o-nj+nmgV6-Gm%Fe91R1ca3`nt*hRS|^ z<&we;TJcUuPDqkM7k0S~cR%t7a`YP#80{BI$e=E!pY}am)2v3-Iqk2qvuAa1YM>xj#bh+H2V z{b#St2<;Gg>$orQ)c2a4AwD5iPcgZ7o_}7xhO86(JSJ(q(EWKTJDl|iBjGEMbX8|P z4PQHi+n(wZ_5QrX0?X_J)e_yGcTM#E#R^u_n8pK@l5416`c9S=q-e!%0RjoPyTliO zkp{OC@Ep^#Ig-n!C)K0Cy%8~**Vci8F1U(viN{==KU0nAg2(+K+GD_Gu#Bx!{tmUm zCwTrT(tCr6X8j43_n96H9%>>?4akSGMvgd+krS4wRexwZ1JxrJy!Uhz#yt$-=aq?A z@?*)bRZxjG9OF~7d$J0cwE_^CLceRK=LvjfH-~{S><^D;6B2&p-02?cl?|$@>`Qt$ zP*iaOxg<+(rbk>34VQDQpNQ|a9*)wScu!}<{oXC87hRPqyrNWpo?#=;1%^D2n2+C* zKKQH;?rWn-@%Y9g%NHG&lHwK9pBfV1a`!TqeU_Fv8s6_(@=RHua7`VYO|!W&WL*x= zIWE9eQaPq3zMaXuf)D0$V`RIZ74f)0P73xpeyk4)-?8j;|K%pD$eq4j2%tL=;&+E91O(2p91K|85b)GQcbRe&u6Ilu@SnE={^{Ix1Eqgv8D z4=w65+&36|;5WhBm$!n*!)ACCwT9Sip#1_z&g~E1kB=AlEhO0lu`Ls@6gw*a)lzc# zKx!fFP%eSBBs)U>xIcQKF(r_$SWD3TD@^^2Ylm=kC*tR+I@X>&SoPZdJ2fT!ysjH% z-U%|SznY8Fhsq7Vau%{Ad^Pvbf3IqVk{M2oD+w>MWimJA@VSZC$QooAO3 zC=DplXdkyl>mSp^$zk7&2+eoGQ6VVh_^E#Z3>tX7Dmi<2aqlM&YBmK&U}m>a%8)LQ 
z8v+c}a0QtXmyd%Kc2QNGf8TK?_EK4wtRUQ*VDnf5jHa?VvH2K(FDZOjAqYufW8oIZ z31|o~MR~T;ZS!Lz%8M0*iVARJ>_G2BXEF8(}6Dmn_rFV~5NI`lJjp`Mi~g7~P%H zO`S&-)Fngo3VXDMo7ImlaZxY^s!>2|csKca6!|m7)l^M0SQT1_L~K29%x4KV8*xiu zwP=GlyIE9YPSTC0BV`6|#)30=hJ~^aYeq7d6TNfoYUkk-^k0!(3qp(7Mo-$|48d8Z2d zrsfsRM)y$5)0G`fNq!V?qQ+nh0xwFbcp{nhW%vZ?h);=LxvM(pWd9FG$Bg1;@Bv)mKDW>AP{ol zD(R~mLzdDrBv$OSi{E%OD`Ano=F^vwc)rNb*Bg3-o)bbAgYE=M7Gj2OHY{8#pM${_^ zwkU|tnTKawxUF7vqM9UfcQ`V49zg78V%W)$#5ssR}Rj7E&p(4_ib^?9luZPJ%iJTvW&-U$nFYky>KJwHpEHHx zVEC;!ETdkCnO|${Vj#CY>LLut_+c|(hpWk8HRgMGRY%E--%oKh@{KnbQ~0GZd}{b@ z`J2qHBcqqjfHk^q=uQL!>6HSSF3LXL*cCd%opM|k#=xTShX~qcxpHTW*BI!c3`)hQq{@!7^mdUaG7sFsFYnl1%blslM;?B8Q zuifKqUAmR=>33g~#>EMNfdye#rz@IHgpM$~Z7c5@bO@S>MyFE3_F}HVNLnG0TjtXU zJeRWH^j5w_qXb$IGs+E>daTa}XPtrUnnpTRO9NEx4g6uaFEfHP9gW;xZnJi{oqAH~ z5dHS(ch3^hbvkv@u3QPLuWa}ImaElDrmIc%5HN<^bwej}3+?g) z-ai7D&6Iq_P(}k`i^4l?hRLbCb>X9iq2UYMl=`9U9Rf=3Y!gnJbr?eJqy>Zpp)m>Ae zcQ4Qfs&AaE?UDTODcEj#$_n4KeERZHx-I+E5I~E#L_T3WI3cj$5EYR75H7hy%80a8Ej?Y6hv+fR6wHN%_0$-xL!eI}fdjOK7(GdFD%`f%-qY@-i@fTAS&ETI99jUVg8 zslPSl#d4zbOcrgvopvB2c2A6r^pEr&Sa5I5%@1~BpGq`Wo|x=&)WnnQjE+)$^U-wW zr2Kv?XJby(8fcn z8JgPn)2_#-OhZ+;72R6PspMfCVvtLxFHeb7d}fo(GRjm_+R(*?9QRBr+yPF(iPO~ zA4Tp1<0}#fa{v0CU6jz}q9;!3Pew>ikG1qh$5WPRTQZ~ExQH}b1hDuzRS1}65uydS z~Te*3@?o8fih=mZ`iI!hL5iv3?VUBLQv0X zLtu58MIE7Jbm?)NFUZuMN2_~eh_Sqq*56yIo!+d_zr@^c@UwR&*j!fati$W<=rGGN zD$X`$lI%8Qe+KzBU*y3O+;f-Csr4$?3_l+uJ=K@dxOfZ?3APc5_x2R=a^kLFoxt*_ z4)nvvP+(zwlT5WYi!4l7+HKqzmXKYyM9kL5wX$dTSFSN&)*-&8Q{Q$K-})rWMin8S zy*5G*tRYNqk7&+v;@+>~EIQgf_SB;VxRTQFcm5VtqtKZ)x=?-f+%OY(VLrXb^6*aP zP&0Nu@~l2L!aF8i2!N~fJiHyxRl?I1QNjB)`uP_DuaU?2W;{?0#RGKTr2qH5QqdhK zP__ojm4WV^PUgmrV)`~f>(769t3|13DrzdDeXxqN6XA|_GK*;zHU()a(20>X{y-x| z2P6Ahq;o=)Nge`l+!+xEwY`7Q(8V=93A9C+WS^W%p&yR)eiSX+lp)?*7&WSYSh4i> zJa6i5T9o;Cd5z%%?FhB?J{l+t_)c&_f86gZMU{HpOA=-KoU5lIL#*&CZ_66O5$3?# ztgjGLo`Y7bj&eYnK#5x1trB_6tpu4$EomotZLb*9l6P(JmqG`{z$?lNKgq?GAVhkA zvw!oFhLyX=$K=jTAMwDQ)E-8ZW5$X%P2$YB5aq!VAnhwGv$VR&;Ix#fu%xlG{|j_K 
zbEYL&bx%*YpXcaGZj<{Y{k@rsrFKh7(|saspt?OxQ~oj_6En(&!rTZPa7fLCEU~mA zB7tbVs=-;cnzv*#INgF_9f3OZhp8c5yk!Dy1+`uA7@eJfvd~g34~wKI1PW%h(y&nA zRwMni12AHEw36)C4Tr-pt6s82EJa^8N#bjy??F*rg4fS@?6^MbiY3;7x=gd~G|Hi& zwmG+pAn!aV>>nNfP7-Zn8BLbJm&7}&ZX+$|z5*5{{F}BRSxN=JKZTa#{ut$v0Z0Fs za@UjXo#3!wACv+p9k*^9^n+(0(YKIUFo`@ib@bjz?Mh8*+V$`c%`Q>mrc5bs4aEf4 zh0qtL1qNE|xQ9JrM}qE>X>Y@dQ?%` zBx(*|1FMzVY&~|dE^}gHJ37O9bjnk$d8vKipgcf+As(kt2cbxAR3^4d0?`}}hYO*O z{+L&>G>AYaauAxE8=#F&u#1YGv%`d*v+EyDcU2TnqvRE33l1r}p#Vmcl%n>NrYOqV z2Car_^^NsZ&K=a~bj%SZlfxzHAxX$>=Q|Zi;E0oyfhgGgqe1Sd5-E$8KV9=`!3jWZCb2crb;rvQ##iw}xm7Da za!H${ls5Ihwxkh^D)M<4Yy3bp<-0a+&KfV@CVd9X6Q?v)$R3*rfT@jsedSEhoV(vqv?R1E8oWV;_{l_+_6= zLjV^-bZU$D_ocfSpRxDGk*J>n4G6s-e>D8JK6-gA>aM^Hv8@)txvKMi7Pi#DS5Y?r zK0%+L;QJdrIPXS2 ztjWAxkSwt2xG$L)Zb7F??cjs!KCTF+D{mZ5e0^8bdu_NLgFHTnO*wx!_8#}NO^mu{FaYeCXGjnUgt_+B-Ru!2_Ue-0UPg2Y)K3phLmR<4 zqUCWYX!KDU!jYF6c?k;;vF@Qh^q(PWwp1ez#I+0>d7V(u_h|L+kX+MN1f5WqMLn!L z!c(pozt7tRQi&duH8n=t-|d)c^;%K~6Kpyz(o53IQ_J+aCapAif$Ek#i0F9U>i+94 zFb=OH5(fk-o`L(o|DyQ(hlozl*2cu#)Y(D*zgNMi1Z!DTex#w#)x(8A-T=S+eByJW z%-k&|XhdZOWjJ&(FTrZNWRm^pHEot_MRQ_?>tKQ&MB~g(&D_e>-)u|`Ot(4j=UT6? 
zQ&YMi2UnCKlBpwltP!}8a2NJ`LlfL=k8SQf69U)~=G;bq9<2GU&Q#cHwL|o4?ah1` z;fG)%t0wMC;DR?^!jCoKib_iiIjsxCSxRUgJDCE%0P;4JZhJCy)vR1%zRl>K?V6#) z2lDi*W3q9rA zo;yvMujs+)a&00~W<-MNj=dJ@4%tccwT<@+c$#CPR%#aE#Dra+-5eSDl^E>is2v^~ z8lgRwkpeU$|1LW4yFwA{PQ^A{5JY!N5PCZ=hog~|FyPPK0-i;fCl4a%1 z?&@&E-)b4cK)wjXGq|?Kqv0s7y~xqvSj-NpOImt{Riam*Z!wz-coZIMuQU>M%6ben z>P@#o^W;fizVd#?`eeEPs#Gz^ySqJn+~`Pq%-Ee6*X+E>!PJGU#rs6qu0z5{+?`-N zxf1#+JNk7e6AoJTdQwxs&GMTq?Djch_8^xL^A;9XggtGL>!@0|BRuIdE&j$tzvt7I zr@I@0<0io%lpF697s1|qNS|BsA>!>-9DVlgGgw2;;k;=7)3+&t!);W3ulPgR>#JiV zUerO;WxuJqr$ghj-veVGfKF?O7si#mzX@GVt+F&atsB@NmBoV4dK|!owGP005$7LN7AqCG(S+={YA- zn#I{UoP_$~Epc=j78{(!2NLN)3qSm-1&{F&1z4Dz&7Mj_+SdlR^Q5{J=r822d4A@?Rj~xATaWewHUOus{*C|KoH`G zHB8SUT06GpSt)}cFJ18!$Kp@r+V3tE_L^^J%9$&fcyd_AHB)WBghwqBEWW!oh@StV zDrC?ttu4#?Aun!PhC4_KF1s2#kvIh~zds!y9#PIrnk9BWkJpq}{Hlqi+xPOR&A1oP zB0~1tV$Zt1pQuHpJw1TAOS=3$Jl&n{n!a+&SgYVe%igUtvE>eHqKY0`e5lwAf}2x( zP>9Wz+9uirp7<7kK0m2&Y*mzArUx%$CkV661=AIAS=V=|xY{;$B7cS5q0)=oq0uXU z_roo90&gHSfM6@6kmB_FJZ)3y_tt0}7#PA&pWo@_qzdIMRa-;U*Dy>Oo#S_n61Fn! z%mrH%tRmvQvg%UqN_2(C#LSxgQ>m}FKLGG=uqJQuSkk=S@c~QLi4N+>lr}QcOuP&% zQCP^cRk&rk-@lpa0^Lcvdu`F*qE)-0$TnxJlwZf|dP~s8cjhL%>^+L~{umxl5Xr6@ z^7zVKiN1Xg;-h+kr4Yt2BzjZs-Mo54`pDbLc}fWq{34=6>U9@sBP~iWZE`+FhtU|x zTV}ajn*Hc}Y?3agQ+bV@oIRm=qAu%|zE;hBw7kCcDx{pm!_qCxfPX3sh5^B$k_2d` z6#rAeUZC;e-LuMZ-f?gHeZogOa*mE>ffs+waQ+fQl4YKoAyZii_!O0;h55EMzD{;) z8lSJvv((#UqgJ?SCQFqJ-UU?2(0V{;7zT3TW`u6GH6h4m3}SuAAj_K(raGBu>|S&Q zZGL?r9@caTbmRm7p=&Tv?Y1)60*9At38w)$(1c?4cpFY2RLyw9c<{OwQE{b@WI}FQ zTT<2HOF4222d%k70yL~x_d#6SNz`*%@4++8gYQ8?yq0T@w~bF@aOHL2)T4xj`AVps9k z?m;<2ClJh$B6~fOYTWIV*T9y1BpB1*C?dgE{%lVtIjw>4MK{wP6OKTb znbPWrkZjYCbr`GGa%Xo0h;iFPNJBI3fK5`wtJV?wq_G<_PZ<`eiKtvN$IKfyju*^t zXc}HNg>^PPZ16m6bfTpmaW5=qoSsj>3)HS}teRa~qj+Y}mGRE?cH!qMDBJ8 zJB!&-=MG8Tb;V4cZjI_#{>ca0VhG_P=j0kcXVX5)^Sdpk+LKNv#yhpwC$k@v^Am&! z_cz2^4Cc{_BC!K#zN!KEkPzviUFPJ^N_L-kHG6}(X#$>Q=9?!{$A(=B3)P?PkxG9gs#l! 
zo6TOHo$F|IvjTC3MW%XrDoc7;m-6wb9mL(^2(>PQXY53hE?%4FW$rTHtN`!VgH72U zRY)#?Y*pMA<)x3B-&fgWQ(TQ6S6nUeSY{9)XOo_k=j$<*mA=f+ghSALYwBw~!Egn!jtjubOh?6Cb-Zi3IYn*fYl()^3u zRiX0I{5QaNPJ9w{yh4(o#$geO7b5lSh<5ZaRg9_=aFdZjxjXv(_SCv^v-{ZKQFtAA}kw=GPC7l81GY zeP@0Da{aR#{6`lbI0ON0y#K=t|L*}MG_HSl$e{U;v=BSs{SU3(e*qa(l%rD;(zM^3 zrRgN3M#Sf(Cr9>v{FtB`8JBK?_zO+~{H_0$lLA!l{YOs9KQd4Zt<3*Ns7dVbT{1Ut z?N9{XkN(96?r(4BH~3qeiJ_CAt+h1}O_4IUF$S(5EyTyo=`{^16P z=VhDY!NxkDukQz>T`0*H=(D3G7Np*2P`s(6M*(*ZJa;?@JYj&_z`d5bap=KK37p3I zr5#`%aC)7fUo#;*X5k7g&gQjxlC9CF{0dz*m2&+mf$Sc1LnyXn9lpZ!!Bl!@hnsE5px};b-b-`qne0Kh;hziNC zXV|zH%+PE!2@-IrIq!HM2+ld;VyNUZiDc@Tjt|-1&kq}>muY;TA3#Oy zWdYGP3NOZWSWtx6?S6ES@>)_Yz%%nLG3P>Z7`SrhkZ?shTfrHkYI;2zAn8h65wV3r z^{4izW-c9!MTge3eN=~r5aTnz6*6l#sD68kJ7Nv2wMbL~Ojj0H;M`mAvk*`Q!`KI? z7nCYBqbu$@MSNd+O&_oWdX()8Eh|Z&v&dJPg*o-sOBb2hriny)< zd(o&&kZM^NDtV=hufp8L zCkKu7)k`+czHaAU567$?GPRGdkb4$37zlIuS&<&1pgArURzoWCbyTEl9OiXZBn4p<$48-Gekh7>e)v*?{9xBt z=|Rx!@Y3N@ffW5*5!bio$jhJ7&{!B&SkAaN`w+&3x|D^o@s{ZAuqNss8K;211tUWIi1B!%-ViYX+Ys6w)Q z^o1{V=hK#+tt&aC(g+^bt-J9zNRdv>ZYm9KV^L0y-yoY7QVZJ_ivBS02I|mGD2;9c zR%+KD&jdXjPiUv#t1VmFOM&=OUE2`SNm4jm&a<;ZH`cYqBZoAglCyixC?+I+}*ScG#;?SEAFob{v0ZKw{`zw*tX}<2k zoH(fNh!>b5w8SWSV}rQ*E24cO=_eQHWy8J!5;Y>Bh|p;|nWH|nK9+ol$k`A*u*Y^Uz^%|h4Owu}Cb$zhIxlVJ8XJ0xtrErT zcK;34CB;ohd|^NfmVIF=XlmB5raI}nXjFz;ObQ4Mpl_`$dUe7sj!P3_WIC~I`_Xy@ z>P5*QE{RSPpuV=3z4p3}dh>Dp0=We@fdaF{sJ|+_E*#jyaTrj-6Y!GfD@#y@DUa;& zu4Iqw5(5AamgF!2SI&WT$rvChhIB$RFFF|W6A>(L9XT{0%DM{L`knIQPC$4F`8FWb zGlem_>>JK-Fib;g*xd<-9^&_ue95grYH>5OvTiM;#uT^LVmNXM-n8chJBD2KeDV7t zbnv3CaiyN>w(HfGv86K5MEM{?f#BTR7**smpNZ}ftm+gafRSt=6fN$(&?#6m3hF!>e$X)hFyCF++Qvx(<~q3esTI zH#8Sv!WIl2<&~=B)#sz1x2=+KTHj=0v&}iAi8eD=M->H|a@Qm|CSSzH#eVIR3_Tvu zG8S**NFbz%*X?DbDuP(oNv2;Lo@#_y4k$W+r^#TtJ8NyL&&Rk;@Q}~24`BB)bgwcp z=a^r(K_NEukZ*|*7c2JKrm&h&NP)9<($f)eTN}3|Rt`$5uB0|!$Xr4Vn#i;muSljn zxG?zbRD(M6+8MzGhbOn%C`M#OcRK!&ZHihwl{F+OAnR>cyg~No44>vliu$8^T!>>*vYQJCJg=EF^lJ*3M^=nGCw`Yg@hCmP(Gq^=eCEE1!t-2>%Al{w@*c% zUK{maww*>K$tu;~I@ERb9*uU@LsIJ|&@qcb!&b 
zsWIvDo4#9Qbvc#IS%sV1_4>^`newSxEcE08c9?rHY2%TRJfK2}-I=Fq-C)jc`gzV( zCn?^noD(9pAf2MP$>ur0;da`>Hr>o>N@8M;X@&mkf;%2A*2CmQBXirsJLY zlX21ma}mKH_LgYUM-->;tt;6F?E5=fUWDwQhp*drQ%hH0<5t2m)rFP%=6aPIC0j$R znGI0hcV~}vk?^&G`v~YCKc7#DrdMM3TcPBmxx#XUC_JVEt@k=%3-+7<3*fTcQ>f~?TdLjv96nb66xj=wVQfpuCD(?kzs~dUV<}P+Fpd)BOTO^<*E#H zeE80(b~h<*Qgez(iFFOkl!G!6#9NZAnsxghe$L=Twi^(Q&48 zD0ohTj)kGLD){xu%pm|}f#ZaFPYpHtg!HB30>F1c=cP)RqzK2co`01O5qwAP zUJm0jS0#mci>|Nu4#MF@u-%-4t>oUTnn_#3K09Hrwnw13HO@9L;wFJ*Z@=gCgpA@p zMswqk;)PTXWuMC-^MQxyNu8_G-i3W9!MLd2>;cM+;Hf&w| zLv{p*hArp9+h2wsMqT5WVqkkc0>1uokMox{AgAvDG^YJebD-czexMB!lJKWllLoBI zetW2;;FKI1xNtA(ZWys!_un~+834+6y|uV&Lo%dKwhcoDzRADYM*peh{o`-tHvwWIBIXW`PKwS3|M>CW37Z2dr!uJWNFS5UwY4;I zNIy1^sr+@8Fob%DHRNa&G{lm?KWU7sV2x9(Ft5?QKsLXi!v6@n&Iyaz5&U*|hCz+d z9vu60IG<v6+^ZmBs_aN!}p|{f(ikVl&LcB+UY;PPz* zj84Tm>g5~-X=GF_4JrVmtEtm=3mMEL1#z+pc~t^Iify^ft~cE=R0TymXu*iQL+XLX zdSK$~5pglr3f@Lrcp`>==b5Z6r7c=p=@A5nXNacsPfr(5m;~ks@*Wu7A z%WyY$Pt*RAKHz_7cghHuQqdU>hq$vD?plol_1EU(Fkgyo&Q2&2e?FT3;H%!|bhU~D z>VX4-6}JLQz8g3%Bq}n^NhfJur~v5H0dbB^$~+7lY{f3ES}E?|JnoLsAG%l^%eu_PM zEl0W(sbMRB3rFeYG&tR~(i2J0)RjngE`N_Jvxx!UAA1mc7J>9)`c=`}4bVbm8&{A` z3sMPU-!r-8de=P(C@7-{GgB<5I%)x{WfzJwEvG#hn3ict8@mexdoTz*(XX!C&~}L* z^%3eYQ8{Smsmq(GIM4d5ilDUk{t@2@*-aevxhy7yk(wH?8yFz%gOAXRbCYzm)=AsM z?~+vo2;{-jkA%Pqwq&co;|m{=y}y2lN$QPK>G_+jP`&?U&Ubq~T`BzAj1TlC`%8+$ zzdwNf<3suPnbh&`AI7RAYuQ<#!sD|A=ky2?hca{uHsB|0VqShI1G3lG5g}9~WSvy4 zX3p~Us^f5AfXlBZ0hA;mR6aj~Q8yb^QDaS*LFQwg!!<|W!%WX9Yu}HThc7>oC9##H zEW`}UQ%JQ38UdsxEUBrA@=6R-v1P6IoIw8$8fw6F{OSC7`cOr*u?p_0*Jvj|S)1cd z-9T);F8F-Y_*+h-Yt9cQQq{E|y^b@r&6=Cd9j0EZL}Pj*RdyxgJentY49AyC@PM<< zl&*aq_ubX%*pqUkQ^Zsi@DqhIeR&Ad)slJ2g zmeo&+(g!tg$z1ao1a#Qq1J022mH4}y?AvWboI4H028;trScqDQrB36t!gs|uZS9}KG0}DD$ zf2xF}M*@VJSzEJ5>ucf+L_AtN-Ht=34g&C?oPP>W^bwoigIncKUyf61!ce!2zpcNT zj&;rPGI~q2!Sy>Q7_lRX*DoIs-1Cei=Cd=+Xv4=%bn#Yqo@C=V`|QwlF0Y- zONtrwpHQ##4}VCL-1ol(e<~KU9-ja^kryz!g!})y-2S5z2^gE$Isj8l{%tF=Rzy`r z^RcP7vu`jHgHLKUE957n3j+BeE(bf;f)Zw($XaU6rZ26Upl#Yv28=8Y`hew{MbH>* 
z-sGI6dnb5D&dUCUBS`NLAIBP!Vi!2+~=AU+)^X^IpOEAn#+ab=`7c z%7B|mZ>wU+L;^&abXKan&N)O;=XI#dTV|9OMYxYqLbtT#GY8PP$45Rm2~of+J>>HIKIVn(uQf-rp09_MwOVIp@6!8bKV(C#(KxcW z;Pesq(wSafCc>iJNV8sg&`!g&G55<06{_1pIoL`2<7hPvAzR1+>H6Rx0Ra%4j7H-<-fnivydlm{TBr06;J-Bq8GdE^Amo)ptV>kS!Kyp*`wUx=K@{3cGZnz53`+C zLco1jxLkLNgbEdU)pRKB#Pq(#(Jt>)Yh8M?j^w&RPUueC)X(6`@@2R~PV@G(8xPwO z^B8^+`qZnQr$8AJ7<06J**+T8xIs)XCV6E_3W+al18!ycMqCfV>=rW0KBRjC* zuJkvrv;t&xBpl?OB3+Li(vQsS(-TPZ)Pw2>s8(3eF3=n*i0uqv@RM^T#Ql7(Em{(~%f2Fw|Reg@eSCey~P zBQlW)_DioA*yxxDcER@_=C1MC{UswPMLr5BQ~T6AcRyt0W44ffJG#T~Fk}wU^aYoF zYTayu-s?)<`2H(w+1(6X&I4?m3&8sok^jpXBB<|ZENso#?v@R1^DdVvKoD?}3%@{}}_E7;wt9USgrfR3(wabPRhJ{#1es81yP!o4)n~CGsh2_Yj2F^z|t zk((i&%nDLA%4KFdG96pQR26W>R2^?C1X4+a*hIzL$L=n4M7r$NOTQEo+k|2~SUI{XL{ynLSCPe%gWMMPFLO{&VN2pom zBUCQ(30qj=YtD_6H0-ZrJ46~YY*A;?tmaGvHvS^H&FXUG4)%-a1K~ly6LYaIn+4lG zt=wuGLw!%h=Pyz?TP=?6O-K-sT4W%_|Nl~;k~YA^_`gqfe{Xw=PWn#9f1mNz)sFuL zJbrevo(DPgpirvGMb6ByuEPd=Rgn}fYXqeUKyM+!n(cKeo|IY%p!#va6`D8?A*{u3 zEeWw0*oylJ1X!L#OCKktX2|>-z3#>`9xr~azOH+2dXHRwdfnpri9|xmK^Q~AuY!Fg z`9Xx?hxkJge~)NVkPQ(VaW(Ce2pXEtgY*cL8i4E)mM(iz_vdm|f@%cSb*Lw{WbShh41VGuplex9E^VvW}irx|;_{VK=N_WF39^ zH4<*peWzgc)0UQi4fBk2{FEzldDh5+KlRd!$_*@eYRMMRb1gU~9lSO_>Vh-~q|NTD zL}X*~hgMj$*Gp5AEs~>Bbjjq7G>}>ki1VxA>@kIhLe+(EQS0mjNEP&eXs5)I;7m1a zmK0Ly*!d~Dk4uxRIO%iZ!1-ztZxOG#W!Q_$M7_DKND0OwI+uC;PQCbQ#k#Y=^zQve zTZVepdX>5{JSJb;DX3%3g42Wz2D@%rhIhLBaFmx#ZV8mhya}jo1u{t^tzoiQy=jJp zjY2b7D2f$ZzJx)8fknqdD6fd5-iF8e(V}(@xe)N=fvS%{X$BRvW!N3TS8jn=P%;5j zShSbzsLs3uqycFi3=iSvqH~}bQn1WQGOL4?trj(kl?+q2R23I42!ipQ&`I*&?G#i9 zWvNh8xoGKDt>%@i0+}j?Ykw&_2C4!aYEW0^7)h2Hi7$;qgF3;Go?bs=v)kHmvd|`R z%(n94LdfxxZ)zh$ET8dH1F&J#O5&IcPH3=8o;%>OIT6w$P1Yz4S!}kJHNhMQ1(prc zM-jSA-7Iq=PiqxKSWb+YbLB-)lSkD6=!`4VL~`ExISOh2ud=TI&SKfR4J08Bad&rj zcXxMpcNgOB?w$~L7l^wPcXxw$0=$oV?)`I44)}b#ChS`_lBQhvb6ks?HDr3tFgkg&td19?b8=!sETXtp=&+3T$cCwZe z0nAET-7561gsbBws$TVjP7QxY(NuBYXVn9~9%vyN-B#&tJhWgtL1B<%BTS*-2$xB` zO)cMDHoWsm%JACZF--Pa7oP;f!n%p`*trlpvZ!HKoB={l+-(8O;;eYv2A=ra 
z3U7rSMCkP_6wAy`l|Se(&5|AefXvV1E#XA(LT!% zjj4|~xlZ-kPLNeQLFyXb%$K}YEfCBvHA-Znw#dZSI6V%3YD{Wj2@utT5Hieyofp6Qi+lz!u)htnI1GWzvQsA)baEuw9|+&(E@p8M+#&fsX@Kf`_YQ>VM+40YLv`3-(!Z7HKYg@+l00WGr779i-%t`kid%e zDtbh8UfBVT3|=8FrNian@aR3*DTUy&u&05x%(Lm3yNoBZXMHWS7OjdqHp>cD>g!wK z#~R{1`%v$IP;rBoP0B0P><;dxN9Xr+fp*s_EK3{EZ94{AV0#Mtv?;$1YaAdEiq5)g zYME;XN9cZs$;*2p63Q9^x&>PaA1p^5m7|W?hrXp2^m;B@xg0bD?J;wIbm6O~Nq^^K z2AYQs@7k)L#tgUkTOUHsh&*6b*EjYmwngU}qesKYPWxU-z_D> zDWr|K)XLf_3#k_9Rd;(@=P^S^?Wqlwert#9(A$*Y$s-Hy)BA0U0+Y58zs~h=YtDKxY0~BO^0&9{?6Nny;3=l59(6ec9j(79M?P1cE zex!T%$Ta-KhjFZLHjmPl_D=NhJULC}i$}9Qt?nm6K6-i8&X_P+i(c*LI3mtl3 z*B+F+7pnAZ5}UU_eImDj(et;Khf-z^4uHwrA7dwAm-e4 zwP1$Ov3NP5ts+e(SvM)u!3aZMuFQq@KE-W;K6 zag=H~vzsua&4Sb$4ja>&cSJ)jjVebuj+?ivYqrwp3!5>ul`B*4hJGrF;!`FaE+wKo z#};5)euvxC1zX0-G;AV@R(ZMl=q_~u8mQ5OYl;@BAkt)~#PynFX#c1K zUQ1^_N8g+IZwUl*n0Bb-vvliVtM=zuMGU-4a8|_8f|2GEd(2zSV?aSHUN9X^GDA8M zgTZW06m*iAy@7l>F3!7+_Y3mj^vjBsAux3$%U#d$BT^fTf-7{Y z_W0l=7$ro5IDt7jp;^cWh^Zl3Ga1qFNrprdu#g=n9=KH!CjLF#ucU5gy6*uASO~|b z7gcqm90K@rqe({P>;ww_q%4}@bq`ST8!0{V08YXY)5&V!>Td)?j7#K}HVaN4FU4DZ z%|7OppQq-h`HJ;rw-BAfH* z1H$ufM~W{%+b@9NK?RAp-$(P0N=b<(;wFbBN0{u5vc+>aoZ|3&^a866X@el7E8!E7 z=9V(Ma**m_{DKZit2k;ZOINI~E$|wO99by=HO{GNc1t?nl8soP@gxk8)WfxhIoxTP zoO`RA0VCaq)&iRDN9yh_@|zqF+f07Esbhe!e-j$^PS57%mq2p=+C%0KiwV#t^%_hH zoO?{^_yk5x~S)haR6akK6d|#2TN& zfWcN zc7QAWl)E9`!KlY>7^DNw$=yYmmRto>w0L(~fe?|n6k2TBsyG@sI)goigj=mn)E)I* z4_AGyEL7?(_+2z=1N@D}9$7FYdTu;%MFGP_mEJXc2OuXEcY1-$fpt8m_r2B|<~Xfs zX@3RQi`E-1}^9N{$(|YS@#{ZWuCxo)91{k>ESD54g_LYhm~vlOK_CAJHeYFfuIVB^%cqCfvpy#sU8Do8u}# z>>%PLKOZ^+$H54o@brtL-hHorSKcsjk_ZibBKBgyHt~L z=T6?e0oLX|h!Z3lbkPMO27MM?xn|uZAJwvmX?Yvp#lE3sQFY)xqet>`S2Y@1t)Z*& z;*I3;Ha8DFhk=YBt~{zp=%%*fEC}_8?9=(-k7HfFeN^GrhNw4e?vx*#oMztnO*&zY zmRT9dGI@O)t^=Wj&Og1R3b%(m*kb&yc;i`^-tqY9(0t!eyOkH<$@~1lXmm!SJllE_ zr~{a&w|8*LI>Z^h!m%YLgKv06Js7j7RaoX}ZJGYirR<#4Mghd{#;38j3|V+&=ZUq#1$ zgZb-7kV)WJUko?{R`hpSrC;w2{qa`(Z4gM5*ZL`|#8szO=PV^vpSI-^K_*OQji^J2 
zZ_1142N}zG$1E0fI%uqHOhV+7%Tp{9$bAR=kRRs4{0a`r%o%$;vu!_Xgv;go)3!B#;hC5qD-bcUrKR&Sc%Zb1Y($r78T z=eG`X#IpBzmXm(o6NVmZdCQf6wzqawqI63v@e%3TKuF!cQ#NQbZ^?6K-3`_b=?ztW zA>^?F#dvVH=H-r3;;5%6hTN_KVZ=ps4^YtRk>P1i>uLZ)Ii2G7V5vy;OJ0}0!g>j^ z&TY&E2!|BDIf1}U(+4G5L~X6sQ_e7In0qJmWYpn!5j|2V{1zhjZt9cdKm!we6|Pp$ z07E+C8=tOwF<<}11VgVMzV8tCg+cD_z?u+$sBjwPXl^(Ge7y8-=c=fgNg@FxI1i5Y-HYQMEH z_($je;nw`Otdhd1G{Vn*w*u@j8&T=xnL;X?H6;{=WaFY+NJfB2(xN`G)LW?4u39;x z6?eSh3Wc@LR&yA2tJj;0{+h6rxF zKyHo}N}@004HA(adG~0solJ(7>?LoXKoH0~bm+xItnZ;3)VJt!?ue|~2C=ylHbPP7 zv2{DH()FXXS_ho-sbto)gk|2V#;BThoE}b1EkNYGT8U#0ItdHG>vOZx8JYN*5jUh5Fdr9#12^ zsEyffqFEQD(u&76zA^9Jklbiz#S|o1EET$ujLJAVDYF znX&4%;vPm-rT<8fDutDIPC@L=zskw49`G%}q#l$1G3atT(w70lgCyfYkg7-=+r7$%E`G?1NjiH)MvnKMWo-ivPSQHbk&_l5tedNp|3NbU^wk0SSXF9ohtM zUqXiOg*8ERKx{wO%BimK)=g^?w=pxB1Vu_x<9jKOcU7N;(!o3~UxyO+*ZCw|jy2}V*Z22~KhmvxoTszc+#EMWXTM6QF*ks% zW47#2B~?wS)6>_ciKe1Fu!@Tc6oN7e+6nriSU;qT7}f@DJiDF@P2jXUv|o|Wh1QPf zLG31d>@CpThA+Ex#y)ny8wkC4x-ELYCXGm1rFI=1C4`I5qboYgDf322B_Nk@#eMZ% znluCKW2GZ{r9HR@VY`>sNgy~s+D_GkqFyz6jgXKD)U|*eKBkJRRIz{gm3tUd*yXmR z(O4&#ZA*us6!^O*TzpKAZ#}B5@}?f=vdnqnRmG}xyt=)2o%<9jj>-4wLP1X-bI{(n zD9#|rN#J;G%LJ&$+Gl2eTRPx6BQC6Uc~YK?nMmktvy^E8#Y*6ZJVZ>Y(cgsVnd!tV z!%twMNznd)?}YCWyy1-#P|2Fu%~}hcTGoy>_uawRTVl=(xo5!%F#A38L109wyh@wm zdy+S8E_&$Gjm=7va-b7@Hv=*sNo0{i8B7=n4ex-mfg`$!n#)v@xxyQCr3m&O1Jxg! z+FXX^jtlw=utuQ+>Yj$`9!E<5-c!|FX(~q`mvt6i*K!L(MHaqZBTtuSA9V~V9Q$G? 
zC8wAV|#XY=;TQD#H;;dcHVb9I7Vu2nI0hHo)!_{qIa@|2}9d ztpC*Q{4Py~2;~6URN^4FBCBip`QDf|O_Y%iZyA0R`^MQf$ce0JuaV(_=YA`knEMXw zP6TbjYSGXi#B4eX=QiWqb3bEw-N*a;Yg?dsVPpeYFS*&AsqtW1j2D$h$*ZOdEb$8n0 zGET4Igs^cMTXWG{2#A7w_usx=KMmNfi4oAk8!MA8Y=Rh9^*r>jEV(-{I0=rc);`Y) zm+6KHz-;MIy|@2todN&F+Yv1e&b&ZvycbTHpDoZ>FIiUn+M-=%A2C(I*^Yx@VKf(Z zxJOny&WoWcyKodkeN^5))aV|-UBFw{?AGo?;NNFFcKzk+6|gYfA#FR=y@?;3IoQ zUMI=7lwo9gV9fRvYi}Nd)&gQw7(K3=a0#p27u6Q)7JlP#A)piUUF8B3Li&38Xk$@| z9OR+tU~qgd3T3322E))eV)hAAHYIj$TmhH#R+C-&E-}5Qd{3B}gD{MXnsrS;{Erv1 z6IyQ=S2qD>Weqqj#Pd65rDSdK54%boN+a?=CkR|agnIP6;INm0A*4gF;G4PlA^3%b zN{H%#wYu|!3fl*UL1~f+Iu|;cqDax?DBkZWSUQodSDL4Es@u6zA>sIm>^Aq-&X#X8 zI=#-ucD|iAodfOIY4AaBL$cFO@s(xJ#&_@ZbtU+jjSAW^g;_w`FK%aH_hAY=!MTjI zwh_OEJ_25zTQv$#9&u0A11x_cGd92E74AbOrD`~f6Ir9ENNQAV2_J2Ig~mHWhaO5a zc>fYG$zke^S+fBupw+klDkiljJAha z6DnTemhkf>hv`8J*W_#wBj-2w(cVtXbkWWtE(3j@!A-IfF?`r$MhVknTs3D1N`rYN zKth9jZtX#>v#%U@^DVN!;ni#n1)U&H_uB{6pcq7$TqXJX!Q0P7U*JUZyclb~)l*DS zOLpoQfW_3;a0S$#V0SOwVeeqE$Hd^L`$;l_~2giLYd?7!gUYIpOs!jqSL~pI)4`YuB_692~A z^T#YYQ_W3Rakk}$SL&{`H8mc{>j+3eKprw6BK`$vSSIn;s31M~YlJLApJ)+Gi1{^- zw96WnT9M0Vr_D=e=a}${raR{(35Q!g+8`}vOFj1e&Or(_wp2U2aVQP0_jP57 z2(R4E(E$n!xl<}Zx38wO;27wuQ`P#_j!}L2 z2qr;As4D4n2X$-Jd_-!fsbu_D(64i;c4cJnP576x_>Q4WNushFwkBV!kVd(AYFXe{ zaqO5`Qfr!#ETmE(B;u_&FITotv~W}QYFCI!&ENKIb1p4fg*Yv1)EDMb==EjHHWM#{ zGMpqb2-LXdHB@D~pE3|+B392Gh4q)y9jBd$a^&cJM60VEUnLtHQD5i-X6PVF>9m_k zDvG3P(?CzdaIrC8s4cu~N9MEb!Tt(g*GK~gIp1Gyeaw3b7#YPx_1T6i zRi#pAMr~PJKe9P~I+ARa$a!K~)t(4LaVbjva1yd;b1Yz2$7MMc`aLmMl(a^DgN(u? 
zq2o9&Gif@Tq~Yq+qDfx^F*nCnpuPv%hRFc$I!p74*quLt^M}D_rwl10uMTr!)(*=7 zSC5ea@#;l(h87k4T4x)(o^#l76P-GYJA(pOa&F9YT=fS<*O{4agzba^dIrh0hjls<~APlIz9{ zgRY{OMv2s|`;VCoYVj?InYoq^QWuA&*VDyOn@pPvK8l~g#1~~MGVVvtLDt}>id_Z` zn(ihfL?Y}Y4YX335m*Xx(y+bbukchHrM zycIGp#1*K3$!(tgTsMD2VyUSg^yvCwB8*V~sACE(yq2!MS6f+gsxv^GR|Q7R_euYx z&X+@@H?_oQddGxJYS&ZG-9O(X+l{wcw;W7srpYjZZvanY(>Q1utSiyuuonkjh5J0q zGz6`&meSuxixIPt{UoHVupUbFKIA+3V5(?ijn}(C(v>=v?L*lJF8|yRjl-m#^|krg zLVbFV6+VkoEGNz6he;EkP!Z6|a@n8?yCzX9>FEzLnp21JpU0x!Qee}lwVKA})LZJq zlI|C??|;gZ8#fC3`gzDU%7R87KZyd)H__0c^T^$zo@TBKTP*i{)Gp3E0TZ}s3mKSY zix@atp^j#QnSc5K&LsU38#{lUdwj%xF zcx&l^?95uq9on1m*0gp$ruu||5MQo)XaN>|ngV5Jb#^wWH^5AdYcn_1>H~XtNwJd3 zd9&?orMSSuj=lhO?6)Ay7;gdU#E}pTBa5wFu`nejq##Xd71BHzH2XqLA5 zeLEo;9$}~u0pEu@(?hXB_l;{jQ=7m?~mwj-ME~Tw-OHPrR7K2Xq9eCNwQO$hR z3_A?=`FJctNXA#yQEorVoh{RWxJbdQga zU%K##XEPgy?E|K(=o#IPgnbk7E&5%J=VHube|2%!Qp}@LznjE%VQhJ?L(XJOmFVY~ zo-az+^5!Ck7Lo<7b~XC6JFk>17*_dY;=z!<0eSdFD2L?CSp_XB+?;N+(5;@=_Ss3& zXse>@sA7hpq;IAeIp3hTe9^$DVYf&?)={zc9*hZAV)|UgKoD!1w{UVo8D)Htwi8*P z%#NAn+8sd@b{h=O)dy9EGKbpyDtl@NBZw0}+Wd=@65JyQ2QgU}q2ii;ot1OsAj zUI&+Pz+NvuRv#8ugesT<<@l4L$zso0AQMh{we$tkeG*mpLmOTiy8|dNYhsqhp+q*yfZA`Z)UC*(oxTNPfOFk3RXkbzAEPofVUy zZ3A%mO?WyTRh@WdXz+zD!ogo}gbUMV!YtTNhr zrt@3PcP%5F;_SQ>Ui`Gq-lUe&taU4*h2)6RDh@8G1$o!){k~3)DT87%tQeHYdO?B` zAmoJvG6wWS?=0(Cj?Aqj59`p(SIEvYyPGJ^reI z`Hr?3#U2zI7k0=UmqMD35l`>3xMcWlDv$oo6;b`dZq3d!~)W z=4Qk)lE8&>#HV>?kRLOHZYz83{u7?^KoXmM^pazj8`7OwQ=5I!==; zA!uN`Q#n=Drmzg}@^nG!mJp9ml3ukWk96^6*us*;&>s+7hWfLXtl?a}(|-#=P12>A zon1}yqh^?9!;on?tRd6Fk0knQSLl4vBGb87A_kJNDGyrnpmn48lz_%P{* z_G*3D#IR<2SS54L5^h*%=)4D9NPpji7DZ5&lHD|99W86QN_(|aJ<5C~PX%YB`Qt_W z>jF_Os@kI6R!ub4n-!orS(G6~mKL7()1g=Lf~{D!LR7#wRHfLxTjYr{*c{neyhz#U zbm@WBKozE+kTd+h-mgF+ELWqTKin57P;0b){ zii5=(B%S(N!Z=rAFGnM6iePtvpxB_Q9-oq_xH!URn2_d-H~i;lro8r{-g!k-Ydb6_w5K@FOV?zPF_hi z%rlxBv$lQi%bjsu^7KT~@u#*c$2-;AkuP)hVEN?W5MO8C9snj*EC&|M!aK6o12q3+ z8e?+dH17E!A$tRlbJW~GtMDkMPT=m1g-v67q{sznnWOI$`g(8E!Pf!#KpO?FETxLK 
z2b^8^@mE#AR1z(DT~R3!nnvq}LG2zDGoE1URR=A2SA z%lN$#V@#E&ip_KZL}Q6mvm(dsS?oHoRf8TWL~1)4^5<3JvvVbEsQqSa3(lF*_mA$g zv`LWarC79G)zR0J+#=6kB`SgjQZ2460W zN%lZt%M@=EN>Wz4I;eH>C0VnDyFe)DBS_2{h6=0ZJ*w%s)QFxLq+%L%e~UQ0mM9ud zm&|r){_<*Om%vlT(K9>dE(3AHjSYro5Y1I?ZjMqWyHzuCE0nyCn`6eq%MEt(aY=M2rIzHeMds)4^Aub^iTIT|%*izG4YH;sT`D9MR(eND-SB+e66LZT z2VX)RJsn${O{D48aUBl|(>ocol$1@glsxisc#GE*=DXHXA?|hJT#{;X{i$XibrA}X zFHJa+ssa2$F_UC(o2k2Z0vwx%Wb(<6_bdDO#=a$0gK2NoscCr;vyx?#cF)JjM%;a| z$^GIlIzvz%Hx3WVU481}_e4~aWcyC|j&BZ@uWW1`bH1y9EWXOxd~f-VE5DpueNofN zv7vZeV<*!A^|36hUE;`#x%MHhL(~?eZ5fhA9Ql3KHTWoAeO-^7&|2)$IcD1r5X#-u zN~N0$6pHPhop@t1_d`dO3#TC0>y5jm>8;$F5_A2& zt#=^IDfYv?JjPPTPNx2TL-Lrl82VClQSLWW_$3=XPbH}xM34)cyW5@lnxy=&h%eRq zv29&h^fMoxjsDnmua(>~OnX{Cq!7vM0M4Mr@_18|YuSKPBKUTV$s^So zc}JlAW&bVz|JY#Eyup6Ny{|P_s0Pq;5*tinH+>5Xa--{ z2;?2PBs((S4{g=G`S?B3Ien`o#5DmUVwzpGuABthYG~OKIY`2ms;33SN9u^I8i_H5`BQ%yOfW+N3r|ufHS_;U;TWT5z;b14n1gX%Pn`uuO z6#>Vl)L0*8yl|#mICWQUtgzeFp9$puHl~m&O+vj3Ox#SxQUa?fY*uK?A;00RiFg(G zK?g=7b5~U4QIK`C*um%=Sw=OJ1eeaV@WZ%hh-3<=lR#(Xesk%?)l4p(EpTwPvN99V@TT)!A8SeFTV+frN=r|5l?K#odjijx2nFgc3kI zC$hVs1S-!z9>xn9MZcRk0YXdYlf~8*LfH$IHKD59H&gLz%6 z#mAYSRJufbRi~LRadwM*G!O2>&U<^d`@<)otXZJJxT@G}4kTx0zPDVhVXwiU)$}5Y z`0iV`8EEh&GlUk&VY9m0Mqr*U&|^Bc?FB`<%{x-o0ATntwIA%(YDcxWs$C)%a%d_@ z?fx!Co+@3p7ha$|pWYD}p6#(PG%_h8K7sQjT_P~|3ZEH0DRxa3~bP&&lPMj3C~!H2QD zq>(f^RUFSqf6K3BMBFy$jiuoSE+DhEq$xLDb7{57 z0B|1pSjYJ5F@cHG%qDZ{ogL$P!BK&sR%zD`gbK#9gRZX17EtAJxN% zys^gb2=X9=7HP}N(iRqt(tot2yyeE%s;L}AcMh;~-W~s_eAe!gIUYdQz5j~T)0trh z>#1U$uOyyl%!Pi(gD&)uHe9Q^27_kHyFCC}n^-KL(=OxHqUfex1YS__RJh0m-S>eM zqAk`aSev*z1lI&-?CycgDm=bdQCp}RqS0_d-4Mf&>u2KyGFxKe8JM1N{GNWw0n$FL z1UDp(h0(1I2Jh9I`?IS}h4R~n zRwRz>8?$fFMB2{UPe^$Ifl;Oc>}@Q9`|8DCeR{?LUQLPfaMsxs8ps=D_aAXORZH~< zdcIOca-F;+D3~M+)Vi4h)I4O3<)$65yI)goQ_vk#fb;Uim>UI4Dv9#2b1;N_Wg>-F zNwKeMKY+su#~NL0uE%_$mw1%ddX2Qs2P!ncM+>wnz}OCQX1!q~oS?OqYU;&ESAAwP z452QWL0&u^mraF#=j_ZeBWhm&F|d!QjwRl^7=Bl7@(43=BkN=3{BRv#QHIk>Umc_w zvP>q|q{lJ=zs|W9%a@8%W>C@MYN1D5{(=Af31+pR#kB`cd0-YlQQTg}+ 
zL|_h=F9JQ|Gux5c0ehaffHNYLf8VwF+qnM6IjBEI_eceee;o;FY@#~FFVsZjBSp!j z8V*Bgmn{RK!!zqGc;jy)z@Zjo>5{%m1?K}fLEL$l6Dl4f=ye0wNI#)2L=^K(&18Gb zJoj8@WBB;P^T#V)I0`aDSy?$rJU{+-5472NyFp>;Vw43j@3Z=;D2eSfyw5*0Q+&ML zsV&&*3c3$pa`qcaGbEB0*CA~Wp3%PkF?B87FV&rWNb|@GU$LB;l|;YutU*k za1hjUL_BX%G^s;BuzRi4Hl?eqC2z&ZrKh1tZDwnufG$g$LX(j!h%F5(n8D@in3lnX z(*8+3ZT6TVYRcSpM1eMeCps=Fz8q%gyM&B=a7(Vf`4k3dN$IM+`BO^_7HZq4BR|7w z+5kOJ;9_$X%-~arA@qmXSzD|+NMh--%5-9u6t(M=f%&z$<_V#Y_lzn{E$MZZG)+A> zu2E`_Y(MBJ2l*AqvCUmU;yBT}#oQ{V=((mC-QGJwsCOH*a;{1JRTKv7DBNG+M!XL7(^jbv&Qy-o9HNFrmN)-`D3WFtXs>1vBOJpI(=x; zKhJlFdfMf^G#oU(w1+ucMKYPZaDp>$kt=wiYsBCjUY-uz<4JziB>6fXDSLH*2Y z&Px5y`#3!fF=c4>fCMdg-tX582pemU@ZxyFbznL8-=TTo1Sybg9>7h*J^9^~XxXJO z`k9v~=4amxl<;FCV9h2k%?^-ZUzQy^#{JleyH23o1S{r<+t#z6jKS<9rbAM96^1iY zi6{IjauB)UwBhC-_L(MzGCxhhv`?ryc zja_Uwi7$8l!}*vjJppGyp#Wz=*?;jC*xQ&J894rql5A$2giJRtV&DWQh#(+Vs3-5_ z69_tj(>8%z1VtVp>a74r5}j2rG%&;uaTQ|fr&r%ew-HO}76i8`&ki%#)~}q4Y|d$_ zfNp9uc#$#OEca>>MaY6rF`dB|5#S)bghf>>TmmE&S~IFw;PF0UztO6+R-0!TSC?QP z{b(RA_;q3QAPW^XN?qQqu{h<}Vfiv}Rr!lA$C79^1=U>+ng9Dh>v{`?AOZt>CrQ=o zI}=mSnR))8fJpO->rcX?H);oqSQUZ?sR!fH2SoFdcPm5*2y<_u;4h;BqcF*XbwWSv zcJN%!g|L(22Xp!^1?c;T&qm%rpkP&2EQC3JF+SENm$+@7#e!UKD1uQ{TDw43?!b!3 zUooS_rt=xJfa&h?c^hfV>YwQXre3qosz_^c#)FO~d!<)2o}Oxz5HWtr<)1Yw012v4 zhv0w(RfJspDnA^-6Jmr;GkWt%{mAYOm6yPb&Vl&rv@D^K&;#?=X{kaK5FhScNJ_3> z#5u(Saisq2(~pVlrfG#@kLM#Ot~5rZZc%B&h1=gen?R+#t^1bYKf zVvtefX=D$*)39e^2@!~A_}9c${Gf0?1;dk=!Itp#s%0>Io%k`9(bDeI-udd&E6Zfu zcaiv(h`DM3W3Mfda)fYwhB=8RAPkotVt5-z21Ij~Ot9A^SK-1u*zFVK&mF?q1;|wy zrF+XWs^5Q-%Z6I62gTwrRe#F>riVM#fv_TihxSJ6to1X7NVszgivoTa!fPfBBYj94 zuc2m zL_k-<1FoORng1aL{Zx(P7JmUiH zlmTHdzkn75=mS{V=o$V;gzhEaunoJzJ3uq>0_w~77eID^U*w+v0po_N8=sS-DL~!V z%-~rL<0V7PCEWPCpNgpfsein`Fr)+8=N}mUn2x=K`z%efnhSs#23&N1fjdO`M>s%z zP3(;v93%lLq>ZfqBi#QI-aCXAP8-may8x5s`G)KA;{HSYe2szWINWf^b*fc{jl0KecD zRTle?)%_YzJJcVb>;VJ>P?3Lu2S)vCJZlF>Jxj~~X2U5-NNNy(H?8%XD~yFUxNKs&hwWx^)iF@ zGmEv<|7Q7hGrY_+`iz+d_=^9c(_c}UCzq2#%A0|5WjzCXjZUOxOX 
zU&-^smw$iwKPe;r`&{rP{L35^&+wk6f2-Sn;D2Ww@sjAJj{Gwbp4H!o{#5_}qALFq z{-q%LGklZvKf%A4D!+t%sRRBDi(>mvuz&V4yu^GdD*KFy?fg%ef5ZU%w=d&M`POGt zNSEJ0{qJI~FRTAjlJc1-+x>Tm{%D?m3sk-&cq#w)OpxI98wCF#2KbWcrAXK_(}M4B zF#VQf*h|irx=+uXZUMi+`A;fPFR5M%Wjs^Wh5rWCKgedhWO^w|@XS;b^&3oom;>K0 zB??|ry^IBarYem6Z7RU`#rDs-ZZAn*hSollv?csD$sh0QpTtI9vb>Dpd}e7*`fZj! zM|8d{~YM@vfW-r0z8vJ z<^6B6Ur(}L?ms_c9@hO0^Iy&J_uc51^?d33e#Y!-``?)VG)BGjCq5$&0G8A*r!2qk zUHscGc;VxE=1KqbH=dW%&Ogl({>L!>((m$2W8M9KQ@a1=h51jN|KoG{v(x0K&*iy% e1c3cF4~(n?C}6GmGu)3JNC)6=LGAhZ*Z%`+-T+_# literal 0 HcmV?d00001 diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100755 index 0000000..37e7bf4 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Thu Aug 02 10:12:04 CEST 2018 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-4.9-all.zip diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..cccdd3d --- /dev/null +++ b/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS="" + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? 
-ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; 
+ esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100755 index 0000000..f955316 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS= + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. 
+echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/src/main/java/org/netspeak/Util.java b/src/main/java/org/netspeak/Util.java new file mode 100755 index 0000000..5b01a05 --- /dev/null +++ b/src/main/java/org/netspeak/Util.java @@ -0,0 +1,238 @@ +package org.netspeak; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.Reader; +import java.nio.charset.StandardCharsets; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.LinkOption; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; +import java.util.concurrent.ExecutionException; +import java.util.concurrent.Future; +import 
java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.Supplier; +import java.util.regex.MatchResult; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import static java.util.Objects.requireNonNull; + +public final class Util { + + private Util() { + } + + /** + * Deletes the given file or directory. + *

+ * System links will not be followed. This will throw for non-empty directories. This operation will do nothing if + * the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile) throws IOException { + delete(dirOrFile, false); + } + + /** + * Deletes the given file or directory (recursively). + *

+ * System links will not be followed. This will throw for non-empty directories if not recursive. This operation + * will do nothing if the given path does not exist. + * + * @param dirOrFile + * @throws IOException + */ + public static void delete(Path dirOrFile, boolean recursive) throws IOException { + if (!recursive) { + Files.deleteIfExists(dirOrFile); + } else { + if (!Files.exists(dirOrFile, LinkOption.NOFOLLOW_LINKS)) { + return; + } + Files.walkFileTree(dirOrFile, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path file, BasicFileAttributes attrs) throws IOException { + Files.delete(file); + return FileVisitResult.CONTINUE; + } + + @Override + public FileVisitResult postVisitDirectory(Path dir, IOException exc) throws IOException { + Files.delete(dir); + return FileVisitResult.CONTINUE; + } + }); + } + } + + public static void createEmptyDirectory(Path dir) throws IOException { + requireNonNull(dir); + if (Files.isDirectory(dir)) { + if (Files.newDirectoryStream(dir).iterator().hasNext()) { + throw new AssertionError("Is not empty " + dir); + } + } else { + Files.createDirectories(dir); + } + } + + public static List getAll(Iterable> futures) throws InterruptedException, ExecutionException { + List values = new ArrayList<>(); + for (Future f : futures) { + values.add(f.get()); + } + return values; + } + + /** + * Returns the + * + * @param path + * @return + * @throws IOException + */ + public static Set readWordList(Path path) throws IOException { + try (FileInputStream fileIn = new FileInputStream(path.toFile()); + Reader in = new InputStreamReader(fileIn, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static Set readWordList(Reader in) throws IOException { + try (BufferedReader bufferedReader = new BufferedReader(in)) { + Set set = new LinkedHashSet<>(); + + bufferedReader.lines().forEach(word -> { + if (word == null || word.isEmpty()) + return; + + word = word.trim(); + if (!word.isEmpty()) { + 
set.add(word); + } + }); + + return set; + } + } + + public static Set readWordList(String path) throws IOException { + return readWordList(Paths.get(path)); + } + + public static Set readResourceWordList(String name) throws IOException { + try (InputStream input = Util.class.getResourceAsStream(name); + Reader in = new InputStreamReader(input, StandardCharsets.UTF_8)) { + return readWordList(in); + } + } + + public static String toPhrase(String[] words) { + if (words.length == 1) + return words[0]; + + StringBuilder sb = new StringBuilder(); + sb.append(words[0]); + + for (int i = 1; i < words.length; i++) { + sb.append(' '); + sb.append(words[i]); + } + + return sb.toString(); + } + + + /** + * Replaces all occurrences of the given pattern in the given string with the string returned by the replacer + * function. + * + * @param pattern + * @param string + * @param replacer + * @return + */ + public static String replaceAll(Pattern pattern, String string, Function replacer) { + Matcher matcher = pattern.matcher(string); + + requireNonNull(replacer); + boolean result = matcher.find(); + if (result) { + StringBuilder sb = new StringBuilder(); + int last; + do { + String replacement = replacer.apply(matcher); + sb.append(replacement); + last = matcher.end(); + result = matcher.find(); + } while (result); + sb.append(string, last, string.length()); + return sb.toString(); + } + return string; + } + + + public interface ThrowsRunnable extends Runnable { + + void runThrowing() throws Exception; + + @Override + default void run() { + try { + runThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public interface ThrowsConsumer extends Consumer { + + void acceptThrowing(T t) throws Exception; + + @Override + default void accept(T t) { + try { + acceptThrowing(t); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + + public 
interface ThrowsSupplier extends Supplier { + + T getThrowing() throws Exception; + + @Override + default T get() { + try { + return getThrowing(); + } catch (RuntimeException e) { + throw e; + } catch (Exception e) { + throw new RuntimeException(e); + } + } + } + +} diff --git a/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java new file mode 100644 index 0000000..1c15dee --- /dev/null +++ b/src/main/java/org/netspeak/io/GoogleBooksCsvReader.java @@ -0,0 +1,130 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for Google books CSV files. + *

+ * These files are a bit difficult to parse because the n-grams are also + * separated by years. So there will be many consecutive occurrences of the same + * phrase but for different years. This reader will automatically parse and + * aggregate these entries. + *

+ * Example: + * + *

+ * collision such    2000    4     4     4
+ * collision such    2001    6     6     6
+ * collision such    2002    6     6     6
+ * collision such    2003    10    11    0
+ * collision such    2004    17    11    5
+ * collision such    2005    14    11    3
+ * collision such    2006    20    22    0
+ * collision such    2007    17    11    7
+ * collision such    2008    19    11    8
+ * 
+ * + * These will all be parsed and aggregated into: + * + *
+ * 
+ * String phrase  = "collision such";
+ * long frequency = 113;
+ * 
+ * 
+ * + * @author Michael Schmidt + * + */ +public class GoogleBooksCsvReader implements PhraseReader { + + private final BufferedReader reader; + private String lastLine = null; + + public GoogleBooksCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = lastLine == null ? reader.readLine() : lastLine; + + MutablePhraseFrequencyPair pair = new MutablePhraseFrequencyPair(null, -1); + while (line != null && !parseLine(line, pair)) { + // we read lines until we find one which parses or arrive at the end + line = reader.readLine(); + } + if (line == null || pair.phrase == null) + return null; + + // aggregate the frequencies of the next lines which also have the current + // phrase + String currentPhrase = pair.phrase; + long currentFrequency = pair.frequency; + + String nextLine; + while ((nextLine = reader.readLine()) != null) { + if (parseLine(nextLine, pair)) { + if (currentPhrase.contentEquals(pair.phrase)) { + currentFrequency += pair.frequency; + } else { + break; + } + } + } + lastLine = nextLine; + + return new PhraseFrequencyPair(currentPhrase, currentFrequency); + } + + /** + * This parses a CSV line. + *

+ * Returns {@code false} if the given line could not be parsed. + */ + private static boolean parseLine(String line, MutablePhraseFrequencyPair pair) { + // e.g. "circumvallate\t1978\t313\t215\t85" + // "The first line tells us that in 1978, the word "circumvallate" occurred 313 + // times overall, on 215 distinct pages and in 85 distinct books." + + // this operation will be done millions of times, so I want to avoid + // String#split + + int firstTab = line.indexOf('\t', 0); + int secondTab = line.indexOf('\t', firstTab + 1); + int thirdTab = line.indexOf('\t', secondTab + 1); + if (firstTab == -1 || secondTab == -1 || thirdTab == -1) + return false; + + // phrases sometimes have a trailing space, so we have to remove that + String phrase = line.substring(0, firstTab).trim(); + // the empty string is not a valid phrase + if (phrase.isEmpty()) { + return false; + } + + pair.phrase = phrase; + pair.frequency = Long.parseLong(line.substring(secondTab + 1, thirdTab)); + + return true; + } + + @Override + public void close() throws IOException { + reader.close(); + } + + private static class MutablePhraseFrequencyPair { + + public String phrase; + public long frequency; + + public MutablePhraseFrequencyPair(final String phrase, final long frequency) { + this.phrase = phrase; + this.frequency = frequency; + } + + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseFrequencyPair.java b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java new file mode 100644 index 0000000..a44ec25 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseFrequencyPair.java @@ -0,0 +1,40 @@ +package org.netspeak.io; + +import static java.util.Objects.requireNonNull; + +public class PhraseFrequencyPair { + + public final String phrase; + public final long frequency; + + /** + * Creates a new phrase frequency pair. + * + * @param phrase + * @param frequency + * @throws NullPointerException if the given phrase is {@code null}. 
+ * @throws IllegalArgumentException if the given frequency is {@code <= 0}. + */ + public PhraseFrequencyPair(final String phrase, final long frequency) { + if (frequency <= 0) { + throw new IllegalArgumentException(); + } + this.phrase = requireNonNull(phrase); + this.frequency = frequency; + } + + @Override + public boolean equals(Object obj) { + if (obj instanceof PhraseFrequencyPair) { + PhraseFrequencyPair other = (PhraseFrequencyPair) obj; + return this.phrase.contentEquals(other.phrase) && this.frequency == other.frequency; + } + return false; + } + + @Override + public int hashCode() { + return phrase.hashCode() ^ (int) frequency ^ (int) (frequency >>> 32); + } + +} diff --git a/src/main/java/org/netspeak/io/PhraseReader.java b/src/main/java/org/netspeak/io/PhraseReader.java new file mode 100644 index 0000000..ed8d31a --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseReader.java @@ -0,0 +1,22 @@ +package org.netspeak.io; + +/** + * A interface for readers which return one phrase-frequency-pair at a time. + * + * @see GoogleBooksCsvReader + * @see SimpleCsvReader + * + * @author Michael + */ +public interface PhraseReader extends AutoCloseable { + + /** + * Returns the next phrase-frequency-pair or {@code null} if no other pairs will + * be returned. + * + * @return + * @throws Exception + */ + PhraseFrequencyPair nextPair() throws Exception; + +} diff --git a/src/main/java/org/netspeak/io/PhraseWriter.java b/src/main/java/org/netspeak/io/PhraseWriter.java new file mode 100644 index 0000000..9052ee1 --- /dev/null +++ b/src/main/java/org/netspeak/io/PhraseWriter.java @@ -0,0 +1,24 @@ +package org.netspeak.io; + +public interface PhraseWriter extends AutoCloseable { + + /** + * Writes the given phrase and frequency. + * + * @param phrase + * @param frequency + * @throws Exception + */ + void write(String phrase, long frequency) throws Exception; + + /** + * Writes the given phrase-frequency-pair. 
+ * + * @param pair + * @throws Exception + */ + default void write(PhraseFrequencyPair pair) throws Exception { + this.write(pair.phrase, pair.frequency); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvReader.java b/src/main/java/org/netspeak/io/SimpleCsvReader.java new file mode 100644 index 0000000..a960cfe --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvReader.java @@ -0,0 +1,61 @@ +package org.netspeak.io; + +import java.io.BufferedReader; +import java.io.IOException; + +/** + * A reader for simple CSV files. + *

+ * In these CSV files, every line ({@code \n}) contains a phrase followed by a + * single tab ({@code \t}) followed by the frequency of that phrase. There may + * be duplicate phrases. A phrase is a non-empty list of words each separated by + * a single whitespace ({@code \u0020}) with no leading or trailing spaces. + * + *

+ * hello world	20
+ * i love you	100
+ * hello world	5
+ * 
+ * + * @author Michael Schmidt + * + */ +public class SimpleCsvReader implements PhraseReader { + + private final BufferedReader reader; + + public SimpleCsvReader(BufferedReader reader) { + this.reader = reader; + } + + @Override + public PhraseFrequencyPair nextPair() throws IOException { + String line = reader.readLine(); + + if (line != null) { + // For better performance, we avoid String#split. Instead we know that a line + // only contains one \t, so we search for that index. To validate the format, we + // also search for a second \t. This is equivalent to: + // String[] parts = line.split("\t"); + // if (parts.length == 2) { create the pair } else { null } + int firstTab = line.indexOf('\t'); + int secondTab = line.indexOf('\t', firstTab + 1); + + // The first tab has to exist and it cannot be 0 because the phrase cannot be + // the empty string. The second tab has to not exist. + if (firstTab > 0 && secondTab == -1) { + String phrase = line.substring(0, firstTab); + long frequency = Long.parseLong(line.substring(firstTab + 1)); + return new PhraseFrequencyPair(phrase, frequency); + } + } + + return null; + } + + @Override + public void close() throws IOException { + reader.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SimpleCsvWriter.java b/src/main/java/org/netspeak/io/SimpleCsvWriter.java new file mode 100755 index 0000000..e92e968 --- /dev/null +++ b/src/main/java/org/netspeak/io/SimpleCsvWriter.java @@ -0,0 +1,31 @@ +package org.netspeak.io; + +import java.io.BufferedWriter; +import java.io.IOException; + +/** + * A writer for CSV files which can be understood by the Netspeak index builder. + *

+ * For more details on the format see {@link SimpleCsvReader}. + * + * @author Michael Schmidt + */ +public class SimpleCsvWriter implements PhraseWriter { + + private final BufferedWriter writer; + + public SimpleCsvWriter(BufferedWriter writer) { + this.writer = writer; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + writer.append(phrase).append('\t').append(Long.toString(frequency)).append('\n'); + } + + @Override + public void close() throws IOException { + writer.close(); + } + +} diff --git a/src/main/java/org/netspeak/io/SplitterCsvWriter.java b/src/main/java/org/netspeak/io/SplitterCsvWriter.java new file mode 100644 index 0000000..37cfa0f --- /dev/null +++ b/src/main/java/org/netspeak/io/SplitterCsvWriter.java @@ -0,0 +1,85 @@ +package org.netspeak.io; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.nio.file.StandardOpenOption.CREATE_NEW; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.nio.charset.CharsetEncoder; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +/** + * This CSV writer will create a given number of CSV files which will be used as + * buckets where phrases will be assigned a bucket according to their hash. + * These bags can then be used for further processing. + *

+ * The {@link #write(String, long)} and {@link #write(PhraseFrequencyPair)} + * methods are thread-safe. + * + * @author Michael + * + */ +public class SplitterCsvWriter implements PhraseWriter { + + private final SimpleCsvWriter[] writers; + private final Path destDir; + private boolean initialized = false; + + public SplitterCsvWriter(Path destDir, int bucketCount) { + this.writers = new SimpleCsvWriter[bucketCount]; + this.destDir = destDir; + } + + @Override + public void close() throws Exception { + Exception last = null; + + for (SimpleCsvWriter writer : writers) { + try { + if (writer != null) + writer.close(); + } catch (Exception e) { + last = e; + } + } + + if (last != null) + throw last; + } + + @Override + public void write(String phrase, long frequency) throws IOException { + initializeWriters(); + + int index = phrase.hashCode() % writers.length; + if (index < 0) + index += writers.length; + SimpleCsvWriter writer = writers[index]; + synchronized (writer) { + writer.write(phrase, frequency); + } + } + + private final void initializeWriters() throws IOException { + if (initialized) + return; + synchronized (this) { + if (initialized) + return; + + for (int i = 0; i < writers.length; i++) { + Path path = Paths.get(destDir.toString(), String.valueOf(i) + ".csv"); + CharsetEncoder encoder = UTF_8.newEncoder(); + Writer writer = new OutputStreamWriter(Files.newOutputStream(path, CREATE_NEW), encoder); + writers[i] = new SimpleCsvWriter(new BufferedWriter(writer, 1024 * 256)); + } + + initialized = true; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/ContractionMapper.java b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java new file mode 100755 index 0000000..b3cfdeb --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/ContractionMapper.java @@ -0,0 +1,246 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; + +import java.io.IOException; +import java.nio.file.Path; +import java.util.ArrayList; 
+import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.function.Consumer; +import java.util.regex.Pattern; + +import static java.util.Locale.ENGLISH; + +public class ContractionMapper implements PhraseMapper { + + private final Pattern contractionPattern; + private final Pattern incompleteContractionPattern; + private final Map knownContractionMap = new HashMap<>(); + private static final Pattern POSSESSIVE_S_PATTERN = Pattern.compile("s '(?= |\\z)", Pattern.CASE_INSENSITIVE); + + public ContractionMapper(Path file) throws IOException { + this(Util.readWordList(file)); + } + + public ContractionMapper(Iterable knownContractions) { + StringBuilder pattern = new StringBuilder(); + Set incompleteContractionSuffixes = new HashSet<>(); + Set incompleteContractionPrefixes = new HashSet<>(); + + for (String known : knownContractions) { + // add know contractions ending with "n't" without ' to the map + // we can't do this for contractions like "we'll" or "i'm" because of the false + // positives + if (known.endsWith("n't")) { + for (String contraction : allCombinations(known)) { + int index = contraction.indexOf('\''); + knownContractionMap.put(contraction.replace("'", "").toLowerCase(ENGLISH), index); + } + } + + // add prefixes and suffixes to lists + int apo = known.indexOf('\''); + incompleteContractionPrefixes.addAll(allCombinations(known.substring(0, apo))); + incompleteContractionSuffixes.addAll(allCombinations(known.substring(apo + 1))); + + // make it all non-capturing for better performance + known = known.replace("\\((?!\\?)", "(?:"); + // replace the ' with all variations + known = known.replace("'", "(?: '|' | ' | )"); + + pattern.append(known); + pattern.append('|'); + } + pattern.append("[^\\s\\S]"); + + // contractionPattern + String finalPattern = "(?<= |\\A)(?:" + pattern.toString() + ")(?= |\\z)"; + // n't can be fixed with minimal context as it's the only contraction 
with both + // prefix and suffix + finalPattern += "|n(?: '|' | ' )t(?= |\\z)"; + // join possessive S + finalPattern += "|(?: '|' | ' )s(?= |\\z)"; + + contractionPattern = Pattern.compile(finalPattern, Pattern.CASE_INSENSITIVE); + + // incompleteContractionPattern + incompleteContractionPrefixes.remove(""); + incompleteContractionSuffixes.remove(""); + + StringBuilder incompletePattern = new StringBuilder(); + incompletePattern.append("(?:\\A| )(?:"); + incompletePattern.append(String.join("|", incompleteContractionPrefixes)); + incompletePattern.append(")(?: ?'\\z)"); + incompletePattern.append("|"); + incompletePattern.append("(?:\\A' ?)(?:"); + incompletePattern.append(String.join("|", incompleteContractionSuffixes)); + incompletePattern.append(")(?: |\\z)"); + + incompleteContractionPattern = Pattern.compile(incompletePattern.toString(), Pattern.CASE_INSENSITIVE); + } + + + @Override + public String map(String phrase, long frequency) { + // phrases with incomplete contractions will be removed + if (incompleteContractionPattern.matcher(phrase).find()) { + return null; + } + + + phrase = Util.replaceAll(contractionPattern, phrase, match -> { + String m = match.group(); + if (m.indexOf('\'') == -1) { + // e.g. "don t" + return m.replace(' ', '\''); + } else { + // e.g. "don' t" or "don 't" or "don ' t" + return m.replace(" ", ""); + } + }); + + String[] words = phrase.split(" "); + boolean changed = false; + for (int i = 0; i < words.length; i++) { + String word = words[i]; + String lowercase = word.toLowerCase(ENGLISH); + Integer ind = knownContractionMap.get(lowercase); + if (ind != null) { + int index = ind; + words[i] = word.substring(0, index) + '\'' + word.substring(index); + changed = true; + } + } + + phrase = changed ? 
Util.toPhrase(words) : phrase; + + phrase = POSSESSIVE_S_PATTERN.matcher(phrase).replaceAll("s'"); + + return phrase; + } + + private static List allCombinations(String pattern) { + List alternatives = new ArrayList<>(); + parseAlternation(pattern, 0, alternatives::add); + + List words = new ArrayList<>(); + for (Concatenation concat : alternatives) { + List builders = new ArrayList<>(); + builders.add(new StringBuilder()); + addCombinations(builders, concat); + builders.forEach(b -> words.add(b.toString())); + } + + return words; + } + + private static void addCombinations(List builders, Concatenation concat) { + for (Element e : concat.getElements()) { + if (e instanceof Literal) { + String value = ((Literal) e).toString(); + builders.forEach(b -> b.append(value)); + } else { + List alternatives = ((Alternation) e).getConcatenations(); + List original = new ArrayList<>(builders); + builders.clear(); + for (Concatenation alternative : alternatives) { + List newBuilders = new ArrayList<>(); + original.forEach(b -> newBuilders.add(new StringBuilder(b))); + addCombinations(newBuilders, alternative); + builders.addAll(newBuilders); + } + } + } + } + + private static int parseAlternation(String pattern, final int startIndex, Consumer consumeConcat) { + int index = startIndex; + + List concat = new ArrayList<>(); + + while (index < pattern.length()) { + char c = pattern.charAt(index++); + if (c == ')') + break; + if (c == '(') { + List alternatives = new ArrayList<>(); + index += parseAlternation(pattern, index, alternatives::add); + if (alternatives.size() == 1) { + concat.addAll(alternatives.get(0).getElements()); + } else { + concat.add(new Alternation(alternatives)); + } + } else if (c == '|') { + consumeConcat.accept(new Concatenation(concat)); + concat = new ArrayList<>(); + } else { + boolean added = false; + if (!concat.isEmpty()) { + Element last = concat.get(concat.size() - 1); + if (last instanceof Literal) { + ((Literal) last).append(c); + added = true; + } 
+ } + if (!added) { + concat.add(new Literal(c)); + } + } + } + + consumeConcat.accept(new Concatenation(concat)); + + return index - startIndex; + } + + private interface Element { + } + + private static class Concatenation { + private final List elements; + + public Concatenation(List elements) { + this.elements = elements; + } + + public List getElements() { + return elements; + } + + } + + private static class Literal implements Element { + private String value; + + public Literal(char value) { + this.value = Character.toString(value); + } + + public void append(char c) { + this.value += c; + } + + @Override + public String toString() { + return value; + } + + } + + private static class Alternation implements Element { + private final List concatenations; + + public Alternation(List concatenations) { + this.concatenations = concatenations; + } + + public List getConcatenations() { + return concatenations; + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java new file mode 100755 index 0000000..e27a2f9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/HyphenationJoiner.java @@ -0,0 +1,337 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsConsumer; + +import java.io.IOException; +import java.io.PrintStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +import static java.util.Objects.requireNonNull; + +/** + * This will join all hyphenated words in two phases. + *

+ * In the first pass it will iterate over all input phrases and extract the vocabulary and stop words. This will be done + * according to the given options. + *

+ * The second phase is specific to the joiner set. Generally, it will try to normalize and, where possible, join + * hyphenated words. + * + * @see German + */ +public class HyphenationJoiner implements PipelineItem { + + private Path logFile; + private ThrowsConsumer vocabularyConsumer; + + private final Joiner joiner; + private final Path output; + private final PreprocessingOptions options; + + public HyphenationJoiner(JoinerFactory joinerFactory, Path output, PreprocessingOptions options) throws Exception { + this.joiner = requireNonNull(joinerFactory).create(); + this.output = requireNonNull(output); + this.options = requireNonNull(options); + } + + /** + * Sets an optional log file. + *

+ * Every action the method takes will be logged here in order where each line is one operation. The result and the + * reason for not joining will be in the following format: + * + *

+	 * {action}:[ {result}:] {phrase}
+	 * 
+ *

+ * The actions are and results are implementation defined and might be different for each language specific joiner. + * See the implementation of the current joiner for more information. + */ + public void setLogger(Path logFile) { + this.logFile = logFile; + } + + public void setVocabularyConsumer(ThrowsConsumer vocabularyConsumer) { + this.vocabularyConsumer = vocabularyConsumer; + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + // Pass 1 + + if (joiner.getRequiresVocabulary()) { + System.out.println("Extracting vocabulary..."); + + VocabularyExtractor vocabExtractor = new VocabularyExtractor(); + Preprocessing.iterate(source, Arrays.asList(vocabExtractor), options); + + System.out.println("Preparing vocabulary..."); + if (vocabularyConsumer != null) { + vocabularyConsumer.accept(vocabExtractor); + } + + Set vocabulary = vocabExtractor.getVocabulary(); + vocabExtractor = null; + System.gc(); + + joiner.setVocabulary(vocabulary); + } + + System.out.println("Joining Hyphenations..."); + options.setMergeDuplicates(true); // this operation is going to create duplicates + + // We use Java 8, so we have to give it a charset name, so it can lookup the charset instance. In never version + // you can give it an instance directly. + String charsetName = StandardCharsets.UTF_8.name(); + final PrintStream logger = logFile == null ? 
null : new PrintStream(logFile.toFile(), charsetName); + try { + joiner.setLogger(logger); + + return Preprocessing.process(source, output, Arrays.asList(joiner), options); + } finally { + if (logger != null) { + logger.close(); + } + } + } + + public interface Joiner extends PhraseMapper { + boolean getRequiresVocabulary(); + + void setVocabulary(Set vocabulary); + + void setLogger(PrintStream logger); + } + + public interface JoinerFactory { + Joiner create() throws Exception; + } + + public static class German implements JoinerFactory { + /** + * The top k words from the vocabulary will be treated as stop words. This will set the k. + */ + int stopWordsTopK = 100; + /** + * An optional stop word list. + *

+ * This list will be merged with the top k stop words from the vocabulary. + */ + Collection stopWordList = null; + + public void setStopWordsTopK(int stopWordsTopK) { + this.stopWordsTopK = stopWordsTopK; + } + + public void setStopWordList(Path stopWordList) throws IOException { + this.stopWordList = Util.readWordList(stopWordList); + } + + public void setStopWordList(Collection stopWordList) { + this.stopWordList = stopWordList; + } + + @Override + public Joiner create() throws Exception { + return new GermanJoiner(this); + } + } + + private static class GermanJoiner implements Joiner { + + private final German options; + + private Set vocabulary; + private Set stopWords = new HashSet<>(); + + private PrintStream logger; + + public GermanJoiner(German options) throws IOException { + this.options = requireNonNull(options); + if (options.stopWordList != null) + stopWords.addAll(options.stopWordList); + } + + @Override + public void setVocabulary(Set vocabulary) { + this.vocabulary = vocabulary; + HyphenationJoiner.addTopK(stopWords, vocabulary, options.stopWordsTopK); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return true; + } + + private String[] normalizeHyphens(String[] words, String phrase) { + if (words.length < 2) + return words; + + int toRemove = 0; + for (int i = 1; i < words.length; i++) { + if ("-".contentEquals(words[i])) + toRemove++; + } + + if (toRemove == 0) + return words; + + String[] newWords = new String[words.length - toRemove]; + newWords[0] = words[0]; + int writeIndex = 1; + for (int i = 1; i < words.length; i++) { + String word = words[i]; + if ("-".contentEquals(words[i])) { + newWords[writeIndex - 1] = newWords[writeIndex - 1] + "-"; + } else { + newWords[writeIndex++] = word; + } + } + + if (logger != null) { + logger.println("Normalize: " + Util.toPhrase(newWords) + ": " + phrase); + } + + return newWords; + } + + private 
String[] joinHyphen(String[] words, String phrase) { + /** + * For all pairs matching the pattern `{words1}- {words2}`, we want to transform + * them to either `{words1}{words2}`, `{words1}-{words2}`, or leave them as is. + */ + + for (int i = 0; i < words.length - 1; i++) { + String word = words[i]; + String next = words[i + 1]; + if (word.length() > 1 && word.charAt(word.length() - 1) == '-') { + + // if the next word is a stop word, we leave it as is. + if (stopWords.contains(next)) { + if (logger != null) { + logger.println("Stop word: " + next + ": " + phrase); + } + continue; + } + + String result = null; + + /** + * To do the word join {word1}{word2}, 3 criteria have to be met: + * + * 1. {word2} can't be a stop word.
+ * 2. {word2} has to begin with a lower case letter.
+ * 3. The concatenation {word1}{word2} has to be a known word. + */ + + if (Character.isLowerCase(next.charAt(0))) { + String concat = word.substring(0, word.length() - 1) + next; + if (vocabulary.contains(concat)) { + result = concat; + if (logger != null) { + logger.println("Full join: " + concat + ": " + phrase); + } + } + } + + words[i] = null; + words[i + 1] = result == null ? word + next : result; + } + } + + return HyphenationJoiner.removeNull(words); + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf('-') == -1) + return phrase; + + String[] words = normalizeHyphens(phrase.split(" "), phrase); + + words = joinHyphen(words, phrase); + + return Util.toPhrase(words); + } + + } + + public static class English implements JoinerFactory { + @Override + public Joiner create() throws Exception { + return new EnglishJoiner(); + } + } + + private static class EnglishJoiner implements Joiner { + + private PrintStream logger; + + @Override + public void setVocabulary(Set vocabulary) { + throw new UnsupportedOperationException(); + } + + @Override + public void setLogger(PrintStream logger) { + this.logger = logger; + } + + @Override + public boolean getRequiresVocabulary() { + return false; + } + + @Override + public String map(String phrase, long frequency) { + if (phrase.indexOf(" - ") == -1) + return phrase; + + String newPhrase = phrase.replace(" - ", "-"); + if (logger != null) { + logger.println("Join: " + newPhrase + ": " + phrase); + } + + return newPhrase; + } + + } + + private static String[] removeNull(String[] words) { + int nullEntries = 0; + for (String word : words) + if (word == null) + nullEntries++; + + if (nullEntries == 0) + return words; + + String[] newWords = new String[words.length - nullEntries]; + int writeIndex = 0; + for (String w : words) { + if (w != null) + newWords[writeIndex++] = w; + } + return newWords; + } + + private static void addTopK(Collection consumer, Collection supplier, int k) { + for 
(T item : supplier) { + if (k-- <= 0) + break; + consumer.add(item); + } + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Operations.java b/src/main/java/org/netspeak/preprocessing/Operations.java new file mode 100644 index 0000000..1d71602 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Operations.java @@ -0,0 +1,198 @@ +package org.netspeak.preprocessing; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; + +import org.netspeak.Util; + +public abstract class Operations { + + private Operations() { + } + + public static PipelineItem standardOperations(Path output, StandardOperationsOptions operationOptions, + PreprocessingOptions options) { + return source -> { + List mappers = new ArrayList<>(); + + // try to remove as much junk as possible + // In this phase, phrases will only be removed and not altered. + mappers.add(PhraseMappers.removeControlCharacters()); + if (operationOptions.superBlacklist != null) { + mappers.add(PhraseMappers.superBlacklist(operationOptions.superBlacklist)); + } + mappers.add(PhraseMappers.removeGoogleWebMarkers()); + mappers.add(PhraseMappers.removeHTMLEntities()); + mappers.add(PhraseMappers.removeURLsAndEmails()); + mappers.add(PhraseMappers.removeFileNames()); + + // Normalization phase + mappers.add(PhraseMappers.explodeCommas()); + mappers.add(PhraseMappers.removeLeadingDoubleQuote()); + mappers.add(PhraseMappers.joinWordsWithLeadingApostrophe()); + + if (operationOptions.blacklist != null) { + mappers.add( + PhraseMappers.blacklist(operationOptions.blacklist, operationOptions.blacklistCombinations)); + } + if (operationOptions.maxNGram < Integer.MAX_VALUE) { + mappers.add(PhraseMappers.maxNGram(operationOptions.maxNGram)); + } + if (operationOptions.toLowerCase) { + mappers.add(PhraseMappers.toLowerCase()); + } + + if (operationOptions.additionalMappers != 
null) { + mappers.addAll(operationOptions.additionalMappers); + } + + // the above operations are going to produce duplicates + options.setMergeDuplicates(true); + + return Preprocessing.process(source, output, mappers, options); + }; + } + + public static class StandardOperationsOptions { + /** + * The maximum allowed number of words per phrase. + */ + int maxNGram = Integer.MAX_VALUE; + /** + * Whether all phrases should be lower-cased. + */ + boolean toLowerCase = false; + /** + * All phrases with at least one word which can be constructed from at most + * {@link #blacklistCombinations} many blacklisted word will be removed. + */ + Collection blacklist = null; + int blacklistCombinations = 4; + /** + * @see PhraseMappers#superBlacklist(Iterable) + */ + Collection superBlacklist = null; + /** + * Additional mappers which will be executed after the mappers defined by the + * method. + */ + List additionalMappers = new ArrayList<>(); + + public void setBlacklist(Path blacklist) throws IOException { + this.blacklist = Util.readWordList(blacklist); + } + + public void setBlacklist(Collection blacklist) { + this.blacklist = blacklist; + } + + public void setBlacklistCombinations(int blacklistCombinations) { + this.blacklistCombinations = blacklistCombinations; + } + + public void setSuperBlacklist(Path superBlacklist) throws IOException { + this.superBlacklist = Util.readWordList(superBlacklist); + } + + public void setSuperBlacklist(Collection superBlacklist) { + this.superBlacklist = superBlacklist; + } + + public List getAdditionalMappers() { + return additionalMappers; + } + + public void setToLowerCase(boolean toLowerCase) { + this.toLowerCase = toLowerCase; + } + + public void setMaxNGram(int maxNGram) { + this.maxNGram = maxNGram; + } + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. 
+ * @return + */ + public static PipelineItem moveTo(Path output) { + return source -> { + Path dest = output.toAbsolutePath(); + System.out.println("Moving to " + dest); + System.out.println("From:"); + System.out.println(source); + + Util.createEmptyDirectory(dest); + List newSources = new ArrayList<>(); + moveTo(newSources, source, dest); + + System.out.println("Done."); + + if (newSources.size() == 1) { + return newSources.get(0); + } else { + return PhraseSource.combine(newSources); + } + }; + } + + /** + * Moves all files to the given directory. + * + * @param output The directory to move to. + * @return + */ + public static PipelineItem moveTo(String output) { + return moveTo(Paths.get(output)); + } + + private static void moveTo(List out, PhraseSource source, Path dest) throws Exception { + if (source instanceof PhraseSource.Combined) { + for (PhraseSource s : ((PhraseSource.Combined) source).getSources()) { + moveTo(out, s, dest); + } + } else if (source instanceof SimplePhraseSource) { + SimplePhraseSource simple = (SimplePhraseSource) source; + // actually move some files + for (PhraseSource.File file : simple.getFiles()) { + Files.move(file.getPath(), dest.resolve(file.getPath().getFileName())); + } + + SimplePhraseSource newSource = new SimplePhraseSource(dest); + newSource.setReaderFactory(simple.readerFactory); + out.add(newSource); + } else { + throw new UnsupportedOperationException( + "Cannot move files of unknown source class " + source.getClass().getName()); + } + } + + /** + * Deletes all files of the input phrase source. + *

+ * The item will return {@link PhraseSource#EMPTY}. + * + * @return + */ + public static PipelineItem delete() { + return source -> { + System.out.println("Deleting:"); + System.out.println(source); + + for (PhraseSource.File file : source.getFiles()) { + Files.delete(file.getPath()); + } + + System.out.println("Done."); + return PhraseSource.EMPTY; + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMapper.java b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java new file mode 100755 index 0000000..908bdc7 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMapper.java @@ -0,0 +1,77 @@ +package org.netspeak.preprocessing; + +/** + * An interface providing a {@link #map(String, long)} function that transforms + * a given phrase. This interface can be used to apply certain string operations + * on phrases, such as case conversion or removal. Filter instances can be + * organized in some sort of collection to be applied one by one on the same + * phrase. + */ +@FunctionalInterface +public interface PhraseMapper { + + /** + * Maps a given input {@code phrase} to some output phrase. The returned phrase + * may be {@code null} or the empty string in which case the phrase will be + * removed from the corpus. + *

+ * The returned phrase is not allowed to contain tabs, line breaks, + * adjacent spaces, and leading or trailing spaces. + * + * @param phrase The input phrase string. This is guaranteed to not be + * {@code null} and to not be the empty string. + * @param frequency The phrase frequency. + * @return The filtered phrase string. + */ + String map(String phrase, long frequency); + + /** + * The name of the PhraseMapper. + *

+ * This name can be useful for diagnostics and will be used by + * {@link Preprocessing} when printing information about a {@link PhraseMapper}. + * By default this will be the name of the class of the mapper. + * + * @return + */ + default String getName() { + return getClass().getName(); + } + + /** + * Returns a new {@link PhraseMapper} which behaves like the given + * {@link PhraseMapper} and with the name of the full name of the caller method. + * + * @param mapper + * @return + */ + static PhraseMapper rename(PhraseMapper mapper) { + StackTraceElement[] stack = Thread.currentThread().getStackTrace(); + StackTraceElement caller = stack[2]; + return rename(caller.getClassName() + "." + caller.getMethodName(), mapper); + } + + /** + * Returns a new {@link PhraseMapper} with the given name which behaves like the + * given {@link PhraseMapper}. + * + * @param name + * @param mapper + * @return + */ + static PhraseMapper rename(String name, PhraseMapper mapper) { + return new PhraseMapper() { + + @Override + public String map(String phrase, long frequency) { + return mapper.map(phrase, frequency); + } + + @Override + public String getName() { + return name; + } + }; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseMappers.java b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java new file mode 100755 index 0000000..74557e9 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseMappers.java @@ -0,0 +1,436 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.function.Predicate; +import java.util.regex.Pattern; + +/** + * Some common {@link PhraseMapper} to be used in {@link Preprocessing}. + */ +public final class PhraseMappers { + + /** + * Returns a new {@link PhraseMapper} which converts phrases to lower case. + *

+ * Example: "You'll make it" becomes "you'll make it" + *

+ * + * @return + */ + public static PhraseMapper toLowerCase() { + return PhraseMapper.rename((phrase, frequency) -> phrase.toLowerCase()); + } + + /** + * Returns a new {@link PhraseMapper} which removes one leading double quote + * from a word. + *

+ * Example: "fo"o ""bar" will become fo"o "bar" and + * " foo will stay " foo + *

+ * + * @return + */ + public static PhraseMapper removeLeadingDoubleQuote() { + return PhraseMapper.rename((phrase, frequency) -> LEADING_DOUBLE_QUOTE_PATTERN.matcher(phrase).replaceAll("")); + } + + private static final Pattern LEADING_DOUBLE_QUOTE_PATTERN = Pattern + .compile("(?:(?!\\G)|\\A)(?:\\A|(?<= ))\"(?=[^ ])"); + + /** + * Returns a new {@link PhraseMapper} which joins two consecutive words within + * the phrase if the second word starts with an apostrophe. + *

+ * Example: "You 'll make it" will become + * "You'll make it" and "don 't" will become + * "don't" + *

+ * + * @return + */ + public static PhraseMapper joinWordsWithLeadingApostrophe() { + return PhraseMapper.rename((phrase, frequency) -> phrase.replace(" '", "'")); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words) { + return PhraseMapper.rename(blacklist(words, 1)); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is an + *
HTML + * entity. + * + * @return + */ + public static PhraseMapper removeHTMLEntities() { + return PhraseMapper.rename(filterByWords(w -> !(w.charAt(0) == '&' && w.charAt(w.length() - 1) == ';'))); + } + + /** + * Removes all control characters. + * + * See + * here + * for more details. + * + * @return + */ + public static PhraseMapper removeControlCharacters() { + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + char c = phrase.charAt(i); + if (c < ' ') // \x00 - \x1F + return null; + if (0x7F <= c && c <= 0x9F) // DEL, \x80 - \x9F + return null; + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is contained in a given blacklist vocabulary. + *

+ * Phrases which contains a word which can be constructed by concatenating + * {@code <= repeating} many words from the blacklist will also be removed. I.e. + * if {@code "} and {@code ?} are in the blacklist and {@code repeating} is 3, + * then {@code """}, {@code "?"}, {@code "?}, and {@code ??} will all be + * removed. + *

+ * Please note that the blacklist will consume {@code O(n ** repeat)} + * many bytes of memory where {@code n} is the number of blacklist entries. + * + * @param words + * @return + */ + public static PhraseMapper blacklist(final Collection words, int repeat) { + HashSet tempBlacklist = new HashSet<>(); + tempBlacklist.addAll(words); + + // just to be safe + tempBlacklist.remove(null); + tempBlacklist.remove(""); + + if (repeat > 1) { + tempBlacklist = new HashSet<>(getAllCombinations(tempBlacklist, repeat)); + } + + // thanks Java + final Set blacklist = tempBlacklist; + + return PhraseMapper.rename(filterByWords(w -> !blacklist.contains(w))); + } + + private static List getAllCombinations(Collection words, int repeat) { + ArrayList combinations = new ArrayList<>((int) Math.pow(words.size(), repeat)); + combinations.addAll(words); + + int start = 0; + for (; repeat > 1; repeat--) { + int size = combinations.size(); + for (int i = start; i < size; i++) { + for (String word : words) { + combinations.add(combinations.get(i) + word); + } + } + start = size; + } + + return combinations; + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases which contain at + * least one word that is not contained in an given whitelist vocabulary. + * + * @param words + * @return + */ + public static PhraseMapper whitelist(final Iterable words) { + final Set whitelist = new HashSet<>(); + for (String word : words) + whitelist.add(word); + + return PhraseMapper.rename(filterByWords(whitelist::contains)); + } + + /** + * Returns a {@link PhraseMapper} which filters out all words for which the + * given predicate returns {@code false}. 
+ * + * @param wordPredicate + * @return + */ + public static PhraseMapper filterByWords(final Predicate wordPredicate) { + return PhraseMapper.rename((phrase, frequency) -> { + for (String word : phrase.split(" ")) { + if (!wordPredicate.test(word)) { + return null; + } + } + return phrase; + }); + } + + /** + * Similar to {@link PhraseMappers#blacklist(Collection)} with the difference + * being that all phrase which contain any of the given strings anywhere will be + * removed. + *

+ * E.g. A super blacklist with the string {@code "--"} will remove the phrase + * {@code "foo--bar"} while a normal blacklist will not. + * + * @param strings + * @return + */ + public static PhraseMapper superBlacklist(final Iterable strings) { + StringMatcherNode matcher = StringMatcherNode.createRoot(strings); + return PhraseMapper.rename((phrase, freq) -> { + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (matcher.matches(phrase, i)) { + return null; + } + } + return phrase; + }); + } + + private static class StringMatcherNode { + + private static final StringMatcherNode ACCEPT = new StringMatcherNode(true); + + private final StringMatcherNode[] next; + + private StringMatcherNode(boolean accept) { + next = accept ? null : new StringMatcherNode[65536]; + } + + public boolean matches(String s, int index) { + if (this == ACCEPT) + return true; + + StringMatcherNode node = this; + int length = s.length(); + for (int i = index; i < length; i++) { + if (node == ACCEPT) + return true; + + int c = s.charAt(i); + node = node.next[c]; + if (node == null) + return false; + } + return node == ACCEPT; + } + + public static StringMatcherNode createRoot(final Iterable words) { + final StringMatcherNode root = new StringMatcherNode(false); + + for (String word : words) { + int length = word.length(); + if (length == 0) + return ACCEPT; + + StringMatcherNode node = root; + for (int i = 0; i < length; i++) { + int c = word.charAt(i); + if (i + 1 == length) { + node.next[c] = ACCEPT; + } else { + StringMatcherNode current = node.next[c]; + if (current == ACCEPT) + break; + if (current == null) + current = node.next[c] = new StringMatcherNode(false); + node = current; + } + } + } + + return root; + } + + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases whose frequency is + * less than a given minimum frequency. 
+ * + * @return + */ + public static PhraseMapper removeIfFrequencyIsLessThan(final long minimumFrequency) { + return PhraseMapper.rename((phrase, frequency) -> frequency < minimumFrequency ? null : phrase); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain at least + * one character that is not included in the Latin-1 character set (ISO/IEC + * 8859-1). The Latin-1 character set contains all characters with code points + * in the range [0, 255]. ASCII is a subset of Latin-1 that covers the range [0, + * 127]. Since Latin-1 characters are encoded in 8 bit they are full compatible + * with languages that use simple 1-byte character types such as C or C++. You + * need to apply this filter as long as the native Netspeak C++ implementation + * has no built-in Unicode support. + * + * @return + */ + public static PhraseMapper removeIfContainsNonLatin1Chars() { + final int maxLatin1CodePoint = 255; + + return PhraseMapper.rename((phrase, frequency) -> { + for (int i = 0; i != phrase.length(); ++i) { + if (phrase.codePointAt(i) > maxLatin1CodePoint) { + return null; + } + } + return phrase; + }); + } + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeURLsAndEmails() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + // check for Email addresses + if (EMAIL_PATTERN.matcher(lower).find()) + return null; + // matches the URL pattern + if (URL_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + // Email addresses can be right about anything which contains an @. 
+ private static final Pattern EMAIL_PATTERN = Pattern.compile(".@."); + private static final String ALL_COUNTRY_TLD = "a[cdefgilmoqrstuwxz]|b[abdefghijmnorstwyz]|c[acdfghiklmnoruvwxyz]|d[ejkmoz]|e[cegrstu]|f[ijkmor]|g[adefghilmnpqrstuwy]|h[kmnrtu]|i[delmnoqrst]|j[emop]|k[eghimnprwyz]|l[abcikrstuvy]|m[acdeghklmnopqrstuvwxyz]|n[acefgilopruz]|om|p[aefghklmnrstwy]|qa|r[eosuw]|s[abcdeghiklmnorstuvxyz]|t[cdfghjklmnortvwz]|u[agksyz]|v[aceginu]|w[fs]|y[et]|z[amw]"; + // some of the more common domains + // https://w3techs.com/technologies/overview/top_level_domain/all + private static final Pattern URL_PATTERN = Pattern + .compile("www\\.|https?:|ftps?:|\\.(?:com|org|net|edu|gov|xyz|moe|club|online|pro|site|top|shop|info|biz|" + + ALL_COUNTRY_TLD + ")\\b"); + + /** + * Returns a new {@link PhraseMapper} that removes phrases that contain URLs or + * email addresses. + * + * @return + */ + public static PhraseMapper removeFileNames() { + return PhraseMapper.rename((phrase, frequency) -> { + String lower = phrase.toLowerCase(); + + if (FILE_NAME_PATTERN.matcher(lower).find()) + return null; + + return phrase; + }); + } + + private static final Pattern FILE_NAME_PATTERN = Pattern.compile( + "\\.(?:exe|dll|bin|msi|bat|com|jar|pkg|apk|ini|ai|ico|jpg|jpeg|png|gif|bmp|webp|tif|tag|ps|odp|pps|ppt|pptx|pdf|doc|docx|xml|csv|sql|zip|rar|tar|gz|7z|iso|webm|mov|mkv|mpg|mpeg|mp3|acc|ogg|wav|wmv|mid|midi|mp4|avi|vlc|html|htm|php|asp|aspx|js|css)\\b"); + + /** + * This removes all phrases with additional markers in the Google web corpus. + * This includes: {@code }, {@code }, {@code }, {@code }, + * {@code }, and {@code }. + * + * @return + */ + public static PhraseMapper removeGoogleWebMarkers() { + return PhraseMapper.rename(blacklist(Arrays.asList("", "", "", "", "", ""))); + } + + /** + * This will make surrounding commas some words have its own word. + *

+ * + *

+	 * "foo," -> "foo ,"
+	 * ",foo,," -> ", foo, ,"
+	 * 
+ * + * @return + */ + public static PhraseMapper splitSurroundingCommas() { + return PhraseMapper.rename((phrase, freq) -> { + String[] words = phrase.split(" "); + for (int i = 0; i < words.length; i++) { + String word = words[i]; + int l = word.length(); + if (l > 1 && (word.charAt(0) == ',' || word.charAt(l - 1) == ',')) { + if (word.contentEquals(",,")) { + words[i] = ", ,"; + } else { + if (word.charAt(0) == ',') { + word = ", " + word.substring(1); + } + if (word.charAt(l - 1) == ',') { + word = word.substring(0, l - 1) + " ,"; + } + } + } + } + return String.join(" ", words); + }); + } + + public static PhraseMapper explodeCommas() { + return PhraseMapper.rename((phrase, freq) -> { + if (phrase.indexOf(',') >= 0) { + return normalizeSpaces(phrase.replace(",", " , ")); + } + return phrase; + }); + } + + /** + * This will remove all phrase which have more than {@code n} words. + * + * @param n The maximum number of words allowed per phrase. + * @return + */ + public static PhraseMapper maxNGram(int n) { + return PhraseMapper.rename((phrase, freq) -> { + int words = 1; + int l = phrase.length(); + for (int i = 0; i < l; i++) { + if (phrase.charAt(i) == ' ') + words++; + } + return words > n ? null : phrase; + }); + } + + private static final Pattern SPACES_PATTERN = Pattern.compile("\\s{2,}"); + + private static String normalizeSpaces(String str) { + return SPACES_PATTERN.matcher(str).replaceAll(" ").trim(); + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PhraseSource.java b/src/main/java/org/netspeak/preprocessing/PhraseSource.java new file mode 100644 index 0000000..a12f73a --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PhraseSource.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.List; + +import org.netspeak.io.PhraseReader; + +/** + * A source of phrases. 
+ * + * @see SimplePhraseSource + */ +public interface PhraseSource { + + Collection getFiles() throws Exception; + + public interface File { + + Path getPath(); + + PhraseReader createReader() throws Exception; + + } + + PhraseSource EMPTY = combine(); + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(PhraseSource... sources) { + return combine(Arrays.asList(sources)); + } + + /** + * Returns a phrase source which contains the files of all the given sources. + * + * @param sources + * @return + */ + static PhraseSource combine(Collection sources) { + final ArrayList src = new ArrayList<>(sources); + return new Combined() { + + @Override + public Collection getSources() { + return src; + } + + @Override + public Collection getFiles() throws Exception { + List files = new ArrayList<>(); + for (PhraseSource source : src) { + files.addAll(source.getFiles()); + } + return files; + } + + @Override + public String toString() { + StringBuilder sb = new StringBuilder(); + boolean first = true; + for (PhraseSource source : src) { + if (first) { + first = false; + } else { + sb.append("\n"); + } + sb.append(source.toString()); + } + return sb.toString(); + } + }; + } + + interface Combined extends PhraseSource { + + Collection getSources(); + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/Pipeline.java b/src/main/java/org/netspeak/preprocessing/Pipeline.java new file mode 100644 index 0000000..9f400bd --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Pipeline.java @@ -0,0 +1,28 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; + +import org.netspeak.Util.ThrowsSupplier; + +public class Pipeline implements PipelineItem { + + private final List items = new ArrayList<>(); + + public void add(PipelineItem item) { + items.add(item); + } + + public void add(ThrowsSupplier supplier) { + 
items.add(supplier.get()); + } + + @Override + public PhraseSource apply(PhraseSource source) throws Exception { + for (PipelineItem item : items) { + source = item.apply(source); + } + return source; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PipelineItem.java b/src/main/java/org/netspeak/preprocessing/PipelineItem.java new file mode 100644 index 0000000..02194ed --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PipelineItem.java @@ -0,0 +1,8 @@ +package org.netspeak.preprocessing; + +@FunctionalInterface +public interface PipelineItem { + + PhraseSource apply(PhraseSource source) throws Exception; + +} diff --git a/src/main/java/org/netspeak/preprocessing/Preprocessing.java b/src/main/java/org/netspeak/preprocessing/Preprocessing.java new file mode 100755 index 0000000..fe79176 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/Preprocessing.java @@ -0,0 +1,371 @@ +package org.netspeak.preprocessing; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsRunnable; +import org.netspeak.io.PhraseFrequencyPair; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.PhraseWriter; +import org.netspeak.io.SimpleCsvReader; +import org.netspeak.io.SimpleCsvWriter; +import org.netspeak.io.SplitterCsvWriter; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; + +import java.io.IOException; +import java.math.BigDecimal; +import java.math.RoundingMode; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Map.Entry; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.Future; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.LongAccumulator; + +import static 
java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; +import static java.util.concurrent.TimeUnit.DAYS; + +/** + *

+ * A class to convert a number of input phrase files to a number of output phrase files by applying user-defined filters + * on each input phrase. + *

+ * + *

+ * For that reason the user can register classes that implement the {@link PhraseMapper} interface to provide a certain + * filter functions. All filters will then be applied on each phrase in the same order they were registered. Some + * predefined {@link PhraseMapper} can be found in the {@link PhraseMappers} class. + *

+ */ +public final class Preprocessing { + + private Preprocessing() { + } + + /** + * Runs the entire preprocessing step which applies a number of filters on each phrase read from files located in + * {@code phraseSrcDir}. As a precondition all files in {@code phraseSrcDir} must be formatted according to the + * phrase file format as defined in {@code netspeak3-application-java-notes.txt}. + *

+ * .zip files will automatically be opened and processed. It's assumed that a .zip file contains only .csv file. + * + * @param outputDir A directory to store output phrase files. + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static PhraseSource process(PhraseSource input, Path outputDir, Collection mappers, + PreprocessingOptions options) throws Exception { + requireNonNull(input); + requireNonNull(outputDir); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + long start = System.currentTimeMillis(); + + Util.createEmptyDirectory(outputDir); + + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + + if (options.mergeDuplicates) { + Path tmp = outputDir.resolve("tmp"); + Util.createEmptyDirectory(tmp); + + // split all phrases by hash into different buckets such that duplicates are in + // the same bucket + try (SplitterCsvWriter writer = new SplitterCsvWriter(tmp, 1024)) { + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + // use NetspeakCsvReader to read the output of SplitterNetspeakCsvWriter + SimplePhraseSource tmpSource = new SimplePhraseSource(tmp); + tmpSource.setReaderFactory(SimpleCsvReader::new); + + // delete temp files + options.setDeleteSource(DeleteMode.PROGRESSIVE); + + // use a simple HashMap to merge the duplicates + System.out.println("Merging phrases"); + AtomicLong totalPhrasesCount = new AtomicLong(0); + AtomicLong totalDuplicatesCount = new AtomicLong(0); + + processAllFiles(options, tmpSource, file -> { + Map map = new HashMap<>(); + try (PhraseReader reader = file.createReader()) { + long phrases = 0; + AtomicLong dups = new 
AtomicLong(0); + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + phrases++; + map.merge(pair.phrase, pair.frequency, (a, b) -> { + dups.incrementAndGet(); + return a + b; + }); + } + totalPhrasesCount.addAndGet(phrases - dups.get()); + totalDuplicatesCount.addAndGet(dups.get()); + } + + // write map + Path out = outputDir.resolve(file.getPath().getFileName()); + try (SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + for (Entry entry : map.entrySet()) { + writer.write(entry.getKey(), entry.getValue()); + } + } + }); + + double percentage = Math + .round(100. * 10. * totalDuplicatesCount.doubleValue() / totalPhrasesCount.doubleValue()) / 10.; + System.out.println("Total of " + totalPhrasesCount + " phrases with " + totalDuplicatesCount + " (" + + percentage + "%) duplicates merged."); + + // clean up + System.out.println("Deleting temporary directory"); + Files.delete(tmp); + } else { + + System.out.println("Applying mappers."); + processAllFiles(options, input, file -> { + String outFileName = file.getPath().getFileName().toString().replaceFirst("(?i).csv[^\\\\/]*", "") + + ".csv"; + Path out = outputDir.resolve(Paths.get(outFileName)); + try (PhraseReader reader = file.createReader(); + SimpleCsvWriter writer = new SimpleCsvWriter(Files.newBufferedWriter(out, UTF_8))) { + applyMappers(reader, writer, mapperArray, stats); + } + }); + } + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + + return new SimplePhraseSource(outputDir); + } + + /** + * This will iterate over all phases just as {@link #process(PhraseSource, Path, Collection, PreprocessingOptions)} + * would but without changing the file system. + *

+ * All mappers can be thought of as consumers. + * + * @param mappers A list of {@link PhraseMapper} objects. + * @throws IOException if any I/O errors occurs. + */ + public static void iterate(PhraseSource input, Collection mappers, PreprocessingOptions options) + throws Exception { + requireNonNull(input); + requireNonNull(mappers); + requireNonNull(options); + + // make a copy of the procession options + options = new PreprocessingOptions(options); + options.setDeleteSource(DeleteMode.NONE); + long start = System.currentTimeMillis(); + + System.out.println("Applying mappers."); + PhraseMapper[] mapperArray = mappers.toArray(new PhraseMapper[0]); + MapperStats[] stats = options.verbose ? createStats(mapperArray) : null; + processAllFiles(options, input, file -> { + try (PhraseReader reader = file.createReader()) { + applyMappers(reader, null, mapperArray, stats); + } + }); + + printStats(stats); + + long end = System.currentTimeMillis(); + System.out.println("Took " + readableDuration(Duration.ofMillis(end - start))); + System.out.println("Done."); + } + + private static void processAllFiles(PreprocessingOptions options, PhraseSource input, ProcessAllConsumer consumer) + throws Exception { + ExecutorService executor = Executors.newFixedThreadPool(options.parallelDegree); + DeleteMode deleteSource = options.deleteSource; + try { + List> futures = new ArrayList<>(); + int i = 0; + Collection files = input.getFiles(); + for (final PhraseSource.File file : files) { + int currentIndex = ++i; + futures.add(executor.submit((ThrowsRunnable) () -> { + int percent = currentIndex * 100 / files.size(); + String prefix = "[" + new Date() + "][" + percent + "% " + currentIndex + "/" + files.size() + "] "; + System.out.println(prefix + "Preprocessing " + file); + + consumer.accept(file); + + if (deleteSource == DeleteMode.PROGRESSIVE) { + Files.delete(file.getPath()); + } + }, file.getPath())); + } + Util.getAll(futures); // wait for all tasks to complete + + if (deleteSource 
== DeleteMode.ATOMIC) { + for (final PhraseSource.File file : files) { + Files.delete(file.getPath()); + } + } + } finally { + executor.shutdown(); + executor.awaitTermination(100, DAYS); + } + } + + @FunctionalInterface + private interface ProcessAllConsumer { + + void accept(PhraseSource.File file) throws Exception; + + } + + private static void applyMappers(PhraseReader reader, PhraseWriter writer, PhraseMapper[] mappers, + MapperStats[] stats) throws Exception { + PhraseFrequencyPair pair; + while ((pair = reader.nextPair()) != null) { + String newPhrase = mapAll(pair.phrase, pair.frequency, mappers, stats); + if (newPhrase != null && writer != null) { + writer.write(newPhrase, pair.frequency); + } + } + } + + private static String mapAll(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (stats == null) { + for (PhraseMapper mapper : mappers) { + if (phrase == null || phrase.isEmpty()) + return null; + phrase = mapper.map(phrase, frequency); + } + return phrase == null || phrase.isEmpty() ? 
null : phrase; + } else { + return mapAllWithStats(phrase, frequency, mappers, stats); + } + } + + private static String mapAllWithStats(String phrase, long frequency, PhraseMapper[] mappers, MapperStats[] stats) { + if (phrase == null || phrase.isEmpty()) + return null; + + for (int i = 0; i < mappers.length; i++) { + long start = System.nanoTime(); + String newPhrase = mappers[i].map(phrase, frequency); + long time = System.nanoTime() - start; + MapperStats s = stats[i]; + s.phrasesTotal.accumulate(1); + s.runTime.accumulate(time); + + if (newPhrase == null || newPhrase.isEmpty()) { + s.phrasesRemoved.accumulate(1); + return null; + } else { + if (phrase.contentEquals(newPhrase)) { + s.phrasesLeftUnchanged.accumulate(1); + } else { + s.phrasesChanged.accumulate(1); + phrase = newPhrase; + } + } + } + return phrase; + } + + private static MapperStats[] createStats(PhraseMapper[] mappers) { + MapperStats[] stats = new MapperStats[mappers.length]; + for (int i = 0; i < mappers.length; i++) { + stats[i] = new MapperStats(mappers[i]); + } + return stats; + } + + private static void printStats(MapperStats[] stats) { + if (stats == null) + return; + + System.out.println(); + for (MapperStats s : stats) { + long total = s.phrasesTotal.get(); + long changed = s.phrasesChanged.get(); + long kept = s.phrasesLeftUnchanged.get(); + long removed = s.phrasesRemoved.get(); + double runTime = s.runTime.get(); + + System.out.println("Mapper: " + s.mapper.getName()); + System.out.println(" total : " + padStart(total, 12)); + if (total > 0) { + double t = total; + System.out.println(" removed: " + padStart(removed, 12) + " (" + percent(removed / t, 2) + ")"); + System.out.println(" changed: " + padStart(changed, 12) + " (" + percent(changed / t, 2) + ")"); + System.out.println(" kept : " + padStart(kept, 12) + " (" + percent(kept / t, 2) + ")"); + System.out.println(" time/phrase: " + round(runTime / total, 2) + "ns/p"); + } + } + System.out.println(); + } + + private static String 
padStart(Object o, int length) { + String s = String.valueOf(o); + + if (s.length() >= length) + return s; + + char[] spaces = new char[length - s.length()]; + for (int i = 0; i < spaces.length; i++) { + spaces[i] = ' '; + } + + return new String(spaces) + s; + } + + private static String percent(double value, int precision) { + return round(value * 100, precision) + "%"; + } + + private static String round(double value, int precision) { + return BigDecimal.valueOf(value).setScale(precision, RoundingMode.HALF_UP).toString(); + } + + private static String readableDuration(Duration duration) { + return duration.toString().substring(2).replaceAll("(\\d[HMS])(?!$)", "$1 ").toLowerCase(); + } + + private static class MapperStats { + + public final PhraseMapper mapper; + public final LongAccumulator phrasesTotal = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesRemoved = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesChanged = new LongAccumulator(Long::sum, 0); + public final LongAccumulator phrasesLeftUnchanged = new LongAccumulator(Long::sum, 0); + /** + * The total run time of the mapper in ns. 
+ */ + public final LongAccumulator runTime = new LongAccumulator(Long::sum, 0); + + public MapperStats(PhraseMapper mapper) { + this.mapper = mapper; + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java new file mode 100644 index 0000000..d2cb2be --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/PreprocessingOptions.java @@ -0,0 +1,87 @@ +package org.netspeak.preprocessing; + +import static java.util.Objects.requireNonNull; + +public class PreprocessingOptions { + int parallelDegree = 1; + boolean mergeDuplicates = false; + DeleteMode deleteSource = DeleteMode.NONE; + boolean verbose = false; + + public PreprocessingOptions() { + } + + public PreprocessingOptions(PreprocessingOptions toCopy) { + parallelDegree = toCopy.parallelDegree; + mergeDuplicates = toCopy.mergeDuplicates; + deleteSource = toCopy.deleteSource; + verbose = toCopy.verbose; + } + + /** + * Sets the maximum number of concurrently processed files. + *

+ * This defaults to {@code 1} meaning that files will be processed in a single thread. + * + * @param parallelDegree + */ + public void setParallelDegree(int parallelDegree) { + this.parallelDegree = parallelDegree; + } + + /** + * Sets whether to merge duplicate phrases between and within files. + *

+ * This option is necessary if your phrases contain duplicates. + *

+ * This defaults to {@code false}. + * + * @param mergeDuplicates + */ + public void setMergeDuplicates(boolean mergeDuplicates) { + this.mergeDuplicates = mergeDuplicates; + } + + /** + * Sets whether the source files will be deleted after they were read. + *

+ * This option is useful to automatically remove temporary files. + *

+ * This defaults to {@link DeleteMode#NONE}. + * + * @param deleteSource + */ + public void setDeleteSource(DeleteMode deleteSource) { + this.deleteSource = requireNonNull(deleteSource); + } + + public enum DeleteMode { + /** + * No files will be deleted. + */ + NONE, + /** + * All files will be deleted at once after all files have been read. + */ + ATOMIC, + /** + * Files will be deleted as soon as possible. + */ + PROGRESSIVE + } + + /** + * Sets whether additional information about the preprocessing step should be + * logged in the console. + *

+ * Note: Enabling this might make the preprocessing slower. + *

+ * This defaults to {@code false}. + * + * @param verbose + */ + public void setVerbose(boolean verbose) { + this.verbose = verbose; + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java new file mode 100644 index 0000000..3267865 --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/SimplePhraseSource.java @@ -0,0 +1,199 @@ +package org.netspeak.preprocessing; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static java.util.Objects.requireNonNull; + +import java.io.BufferedInputStream; +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.nio.file.DirectoryStream.Filter; +import java.nio.file.FileSystems; +import java.nio.file.FileVisitResult; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.PathMatcher; +import java.nio.file.Paths; +import java.nio.file.SimpleFileVisitor; +import java.nio.file.attribute.BasicFileAttributes; +import java.util.ArrayList; +import java.util.Collection; +import java.util.List; +import java.util.function.Function; +import java.util.zip.GZIPInputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; + +import org.apache.commons.compress.compressors.CompressorStreamFactory; +import org.netspeak.io.PhraseReader; +import org.netspeak.io.SimpleCsvReader; + +public class SimplePhraseSource implements PhraseSource { + + final Path path; + Function readerFactory = SimpleCsvReader::new; + private Filter fileFilter; + + public SimplePhraseSource(Path path) { + this.path = requireNonNull(path); + } + + public SimplePhraseSource(String path) { + this.path = Paths.get(requireNonNull(path)); + } + + @Override + public String toString() { + return path.toString(); + } + + /** + * Sets the factory to create a new {@link PhraseReader} from the given + * {@link BufferedReader}. + *

+ * This defaults to {@code SimpleCsvReader::new}. + * + * @param readerFactory + */ + public void setReaderFactory(Function readerFactory) { + this.readerFactory = requireNonNull(readerFactory); + } + + /** + * Sets a filter which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param fileFilter + */ + public void setFileFilter(Filter fileFilter) { + this.fileFilter = fileFilter; + } + + /** + * Sets a glob pattern which decides whether a file will be processed. + *

+ * This defaults to {@code null} meaning that all files in the given directory + * will be processed. + * + * @param globPattern + */ + public void setFileFilter(String globPattern) { + if (globPattern == null) { + this.fileFilter = null; + } else { + final PathMatcher pathMatcher = FileSystems.getDefault().getPathMatcher(globPattern); + this.fileFilter = pathMatcher::matches; + } + } + + @Override + public Collection getFiles() throws Exception { + if (!Files.isDirectory(path)) { + throw new AssertionError("Not a directory " + path); + } + + List files = new ArrayList<>(); + SimplePhraseSource that = this; + + Files.walkFileTree(path, new SimpleFileVisitor() { + @Override + public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) throws IOException { + if (fileFilter == null || fileFilter.accept(path)) { + files.add(new PhrasesSourceFile(that, path)); + } + return FileVisitResult.CONTINUE; + } + }); + + return files; + } + + private static class PhrasesSourceFile implements PhraseSource.File { + + private final SimplePhraseSource source; + private final Path path; + + public PhrasesSourceFile(SimplePhraseSource source, Path path) { + this.source = requireNonNull(source); + this.path = requireNonNull(path); + } + + @Override + public Path getPath() { + return path; + } + + @Override + public String toString() { + return path.toString(); + } + + @Override + public PhraseReader createReader() throws Exception { + BufferedReader br; + + String lowerPath = path.toString().toLowerCase(); + if (lowerPath.endsWith(".zip")) { + br = createZipReader(); + } else if (lowerPath.endsWith(".bz2")) { + br = createBz2Reader(); + } else if (lowerPath.endsWith(".gz")) { + br = createGZipReader(); + } else { + br = Files.newBufferedReader(path, UTF_8); + } + + try { + return source.readerFactory.apply(br); + } catch (Throwable e) { + br.close(); + throw e; + } + } + + private BufferedReader createZipReader() throws Exception { + // we assume that the .zip contains only 
one file which is a CSV file + BufferedInputStream bis = null; + ZipInputStream zip = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + zip = new ZipInputStream(bis); + ZipEntry entry = zip.getNextEntry(); + if (entry == null) { + throw new IllegalStateException("The .zip file is empty."); + } + if (!entry.getName().toLowerCase().endsWith(".csv")) { + throw new IllegalStateException("The .zip file is only allowed to contain a CSV file."); + } + return new BufferedReader(new InputStreamReader(zip, UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + if (zip != null) + zip.close(); + throw t; + } + } + + private BufferedReader createBz2Reader() throws Exception { + BufferedInputStream bis = null; + try { + bis = new BufferedInputStream(Files.newInputStream(path)); + return new BufferedReader( + new InputStreamReader(new CompressorStreamFactory().createCompressorInputStream(bis), UTF_8)); + } catch (Throwable t) { + if (bis != null) + bis.close(); + throw t; + } + } + + private BufferedReader createGZipReader() throws IOException { + return new BufferedReader(new InputStreamReader(new GZIPInputStream(Files.newInputStream(path)), UTF_8)); + } + + } + +} diff --git a/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java new file mode 100644 index 0000000..c3ed67f --- /dev/null +++ b/src/main/java/org/netspeak/preprocessing/VocabularyExtractor.java @@ -0,0 +1,99 @@ +package org.netspeak.preprocessing; + +import java.io.BufferedWriter; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.atomic.LongAccumulator; +import java.util.function.Function; + +import 
org.netspeak.io.PhraseFrequencyPair; +import org.netspeak.io.PhraseWriter; +import org.netspeak.io.SimpleCsvWriter; + +/** + * A phrase mapper that will create a vocabulary from all phrases it sees. + *

+ * This mapper will not change any phrases. + */ +public class VocabularyExtractor implements PhraseMapper { + + private Map vocabulary = new ConcurrentHashMap<>(); + private List list; + + private List getPairs() { + if (list == null) { + list = new ArrayList<>(); + vocabulary.forEach((phrase, counter) -> { + list.add(new PhraseFrequencyPair(phrase, counter.get())); + }); + vocabulary = null; + + list.sort((a, b) -> { + if (a.frequency > b.frequency) { + return -1; + } else if (a.frequency < b.frequency) { + return 1; + } + return a.phrase.compareTo(b.phrase); + }); + } + return list; + } + + @Override + public String map(String phrase, long frequency) { + for (String word : phrase.split(" ")) { + LongAccumulator counter = vocabulary.computeIfAbsent(word, key -> new LongAccumulator(Long::max, 0)); + counter.accumulate(frequency); + } + return phrase; + } + + public void writePairs(PhraseWriter writer) throws Exception { + for (PhraseFrequencyPair pair : getPairs()) { + writer.write(pair); + } + } + + public void writePairs(Path file) throws Exception { + writePairs(file, SimpleCsvWriter::new); + } + + public void writePairs(Path file, Function writerFactory) throws Exception { + try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE); + PhraseWriter phraseWriter = writerFactory.apply(writer)) { + writePairs(phraseWriter); + } + } + + public void writeVocabulary(BufferedWriter writer) throws IOException { + String newLine = "\n"; + for (PhraseFrequencyPair pair : getPairs()) { + writer.write(pair.phrase); + writer.write(newLine); + } + } + + public void writeVocabulary(Path file) throws IOException { + try (BufferedWriter writer = Files.newBufferedWriter(file, StandardOpenOption.CREATE)) { + writeVocabulary(writer); + } + } + + public Set getVocabulary() { + Set set = new LinkedHashSet<>(); + for (PhraseFrequencyPair pair : getPairs()) { + set.add(pair.phrase); + } + return set; + } + +} diff --git 
a/src/main/java/org/netspeak/usage/NetspeakBuilderUsage.java b/src/main/java/org/netspeak/usage/NetspeakBuilderUsage.java new file mode 100644 index 0000000..d553f3b --- /dev/null +++ b/src/main/java/org/netspeak/usage/NetspeakBuilderUsage.java @@ -0,0 +1,46 @@ +package org.netspeak.usage; + +import org.netspeak.Util; +import org.netspeak.preprocessing.ContractionMapper; +import org.netspeak.preprocessing.PhraseMapper; +import org.netspeak.preprocessing.PhraseMappers; +import org.netspeak.preprocessing.PhraseSource; +import org.netspeak.preprocessing.Pipeline; +import org.netspeak.preprocessing.Preprocessing; +import org.netspeak.preprocessing.PreprocessingOptions; +import org.netspeak.preprocessing.SimplePhraseSource; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; + +/** + * Demonstrates the usage of {@link Preprocessing}. + */ +public final class NetspeakBuilderUsage { + + public static void main(String[] args) throws Exception { + + Pipeline pipeline = new Pipeline(); + + pipeline.add(source -> { + Path output = Paths.get("C:\\Netspeak\\_out"); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setMergeDuplicates(true); + + Path conFile = Paths.get("D:\\netspeak\\contractions_eng.txt"); + ContractionMapper con = new ContractionMapper(conFile); + + PhraseMapper superBalcklist = PhraseMappers + .superBlacklist(Util.readWordList(Paths.get("D:\\netspeak\\super_blacklist.txt"))); + + return Preprocessing.process(source, output, Arrays.asList(superBalcklist, con), options); + }); + + SimplePhraseSource source1 = new SimplePhraseSource("C:\\Netspeak\\processed_corpora\\eng_ci_web+books"); + pipeline.apply(PhraseSource.combine(source1)); + } + +} diff --git a/src/main/java/org/netspeak/usage/NetspeakTerminal.java b/src/main/java/org/netspeak/usage/NetspeakTerminal.java new file mode 100755 index 0000000..92b299b --- /dev/null +++ 
b/src/main/java/org/netspeak/usage/NetspeakTerminal.java @@ -0,0 +1,61 @@ +package org.netspeak.usage; + +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import java.util.Scanner; + +import org.netspeak.Configuration; +import org.netspeak.Netspeak; +import org.netspeak.NetspeakUtil; +import org.netspeak.generated.NetspeakMessages.Request; +import org.netspeak.generated.NetspeakMessages.Response; + +/** + * Runs an interactive prompt to search Netspeak via command line. + */ +public class NetspeakTerminal { + + public static void main(String[] args) throws Exception { + + // --------------------------------------------------------------------- + // CONFIGURATION + // --------------------------------------------------------------------- + Configuration config = new Configuration(); + config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en"); + config.put(Configuration.CACHE_CAPACITY, "10000"); + + // --------------------------------------------------------------------- + // START NETSPEAK + // --------------------------------------------------------------------- + Netspeak netspeak = new Netspeak(config); + + // --------------------------------------------------------------------- + // TERMINAL INTERACTION + // --------------------------------------------------------------------- + PrintWriter pw = new PrintWriter(new OutputStreamWriter(System.out)); + + try (final Scanner scanner = new Scanner(System.in);) { + Request.Builder rb = Request.newBuilder(); + while (true) { + pw.print("\nEnter query (type 'q' to exit): "); + pw.flush(); + String query = scanner.nextLine(); + if (query.equals("q")) + break; + long start = System.currentTimeMillis(); + Request request = rb.setQuery(query).build(); + Response response = netspeak.search(request); + for (int i = 0; i != response.getPhraseCount(); ++i) { + System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(), + 
NetspeakUtil.toString(response.getPhrase(i))); + } + System.out.println("Error code: " + response.getErrorCode()); + System.out.println("Error message: " + response.getErrorMessage()); + System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList())); + System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery())); + System.out.println("Time: " + (System.currentTimeMillis() - start)); + rb = request.toBuilder(); + } + } + } +} diff --git a/src/main/java/org/netspeak/usage/NetspeakUsage.java b/src/main/java/org/netspeak/usage/NetspeakUsage.java new file mode 100755 index 0000000..f8bb6aa --- /dev/null +++ b/src/main/java/org/netspeak/usage/NetspeakUsage.java @@ -0,0 +1,112 @@ +package org.netspeak.usage; + +import java.util.Map; + +import org.netspeak.Configuration; +import org.netspeak.ErrorCode; +import org.netspeak.Netspeak; +import org.netspeak.NetspeakUtil; +import org.netspeak.generated.NetspeakMessages.Request; +import org.netspeak.generated.NetspeakMessages.Response; + +import com.google.protobuf.InvalidProtocolBufferException; + +public class NetspeakUsage { + + public static void main(String[] args) { + + // --------------------------------------------------------------------- + // CONFIGURATION + // --------------------------------------------------------------------- + Configuration config = new Configuration(); + config.put(Configuration.PATH_TO_HOME, "/media/michael/Volume/data-in-production/netspeak/netspeak3-web-en"); + config.put(Configuration.CACHE_CAPACITY, "10000"); + + // --------------------------------------------------------------------- + // START NETSPEAK + // --------------------------------------------------------------------- + Netspeak netspeak = new Netspeak(config); + + // --------------------------------------------------------------------- + // SEARCH NETSPEAK + // --------------------------------------------------------------------- + Request.Builder rb = Request.newBuilder(); + 
rb.setQuery("programming is *"); + // Advanced parameters (optional) +// rb.setMaxPhraseCount(int); // default: 100 (find at most X n-grams) +// rb.setPhraseLengthMin(int); // default: 1 (minimum n-gram length) +// rb.setPhraseLengthMax(int); // default: 5 (maximum n-gram length) + + Request request = rb.build(); + Response response = null; + try { + response = netspeak.search(request); + } catch (InvalidProtocolBufferException e) { + e.printStackTrace(); + } + + // Tip: As you will see, there are no setter methods to prepare your + // request object with a new query for your next search. But you can + // and you should reuse your request object like that: + // request = request.toBuilder().setQuery("be efficient and ?").build(); + + // --------------------------------------------------------------------- + // ERROR HANDLING + // --------------------------------------------------------------------- + // A Netspeak search will never throw any exceptions. + // Errors are indicated by the response's error code. + System.out.println("Error: " + response.getErrorCode()); + switch (ErrorCode.fromCode(response.getErrorCode())) { + case NO_ERROR: + // ... + break; + case INVALID_QUERY: + // ... + break; + case SERVER_ERROR: + // ... + break; + case UNKNOWN_ERROR: + // ... + break; + } + + // You can also handle errors like this: + // if (ErrorCode.cast(response.getErrorCode()) != ErrorCode.NO_ERROR) + + // --------------------------------------------------------------------- + // READ RESPONSE + // --------------------------------------------------------------------- + // Returns the total frequency (100% basis) of the returned n-grams. + // This is not the same value as the sum of all n-gram frequencies. + System.out.println("Total frequency: " + response.getTotalFrequency()); + // Returns the tokenized query string produced by the query lexer. 
+ System.out.println("Tokenized query: " + String.join(" ", response.getQueryTokenList())); + // Returns the parsed (valid) query produced by the query parser. + System.out.println("Parsed query: " + NetspeakUtil.toString(response.getQuery())); + // Returns the request object. + System.out.println("Request was: " + response.getRequest()); + + // Loop through the returned phrases + for (int i = 0; i != response.getPhraseCount(); ++i) { + System.out.printf("%-5d%-15d%s\n", i, response.getPhrase(i).getFrequency(), + response.getPhrase(i).toString()); + } + + // You can also iterate like that: + // for (Phrase phrase : response.getPhraseList()) { + // System.out.println(phrase); // Complete phrase in JSON style + // } + + // --------------------------------------------------------------------- + // NETSPEAK PROPERTIES (Some interesting values) + // --------------------------------------------------------------------- + try { + for (Map.Entry entry : netspeak.getProperties().entrySet()) { + System.out.println(entry); + } + } catch (InvalidProtocolBufferException e) { + e.printStackTrace(); + } + } +} diff --git a/src/main/java/org/netspeak/usage/PreprocessingUsage.java b/src/main/java/org/netspeak/usage/PreprocessingUsage.java new file mode 100644 index 0000000..b42f00f --- /dev/null +++ b/src/main/java/org/netspeak/usage/PreprocessingUsage.java @@ -0,0 +1,152 @@ +package org.netspeak.usage; + +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.Arrays; + +import org.netspeak.Util; +import org.netspeak.Util.ThrowsRunnable; +import org.netspeak.preprocessing.ContractionMapper; +import org.netspeak.preprocessing.HyphenationJoiner; +import org.netspeak.preprocessing.Operations; +import org.netspeak.preprocessing.Operations.StandardOperationsOptions; +import org.netspeak.preprocessing.PhraseMappers; +import org.netspeak.preprocessing.PhraseSource; +import org.netspeak.preprocessing.Pipeline; +import org.netspeak.preprocessing.Preprocessing; +import 
org.netspeak.preprocessing.PreprocessingOptions; +import org.netspeak.preprocessing.PreprocessingOptions.DeleteMode; +import org.netspeak.preprocessing.SimplePhraseSource; + +public class PreprocessingUsage { + + /* + * You have to specify two temporary directories. + * + * Ideally, these should be your fastest storage capable of holding the whole + * data set of a pipeline. This means you can read the data from a HDD, process + * it on an SSD. + */ + + static Path temp1 = Paths.get("path/to/temp1"); + static Path temp2 = Paths.get("path/to/temp2"); + + public static void main(String[] args) throws Exception { + useTemp(() -> { + + PhraseSource german = new SimplePhraseSource("path/to/german/data"); + processGerman(german, Paths.get("out/german")); + + }); + } + + private static void useTemp(ThrowsRunnable runnable) throws Exception { + // clear temporary directories before and after pre-processing + Util.delete(temp1, true); + Util.delete(temp2, true); + try { + runnable.runThrowing(); + } finally { + Util.delete(temp1, true); + Util.delete(temp2, true); + } + } + + /** + * + * @throws Exception + */ + static void processGerman(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + operationOptions.getAdditionalMappers() + .add(new ContractionMapper(Util.readResourceWordList("eng/contractions.txt"))); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path 
output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.German german = new HyphenationJoiner.German(); + german.setStopWordList(Util.readResourceWordList("ger/stop-words.txt")); + + return new HyphenationJoiner(german, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void processEnglish(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(() -> { + Path output = temp1; + + StandardOperationsOptions operationOptions = new StandardOperationsOptions(); + operationOptions.setSuperBlacklist(Util.readResourceWordList("super-blacklist.txt")); + operationOptions.setBlacklist(Util.readResourceWordList("blacklist.txt")); + operationOptions.setBlacklistCombinations(4); + operationOptions.setMaxNGram(5); + operationOptions.setToLowerCase(false); + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Operations.standardOperations(output, operationOptions, options); + }); + + pipeline.add(() -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + options.setDeleteSource(DeleteMode.PROGRESSIVE); // delete files from temp + + HyphenationJoiner.English english = new HyphenationJoiner.English(); + + return new HyphenationJoiner(english, output, options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + + static void toLowerCase(PhraseSource source, Path outDir) throws Exception { + Pipeline pipeline = new Pipeline(); + + pipeline.add(inputSource -> { + Path output = temp2; + + PreprocessingOptions options = new PreprocessingOptions(); + options.setParallelDegree(8); + + return Preprocessing.process(inputSource, output, Arrays.asList(PhraseMappers.toLowerCase()), 
options); + }); + + pipeline.add(Operations.moveTo(outDir)); + + pipeline.apply(source); + } + +} diff --git a/src/main/resources/blacklist.txt b/src/main/resources/blacklist.txt new file mode 100644 index 0000000..96f9ba1 --- /dev/null +++ b/src/main/resources/blacklist.txt @@ -0,0 +1,67 @@ +' +" +„ +“ +„ +” +« +» +` +´ + +-- +--- +---- ++ += +~ +* +% +# +.. +... +.... +: +| +( +) +[ +] +{ +} +< +> +^ +@ +/ +\ +& + +$ +€ +£ +¥ + +Ã +¼ +¤ + +² +³ + +— +• +■ +¬ +→ +· +_ +… +® +© +█ +™ +♥ +Ȳ +¶ +± diff --git a/src/main/resources/eng/contractions.txt b/src/main/resources/eng/contractions.txt new file mode 100644 index 0000000..4c1baca --- /dev/null +++ b/src/main/resources/eng/contractions.txt @@ -0,0 +1,9 @@ +i'm +(he|she|it)'s +(you|we|they)'re + +(i|you|he|she|it|we|they)'(d|ll|ve) + +y'all + +(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't diff --git a/src/main/resources/ger/stop-words.txt b/src/main/resources/ger/stop-words.txt new file mode 100644 index 0000000..310c68e --- /dev/null +++ b/src/main/resources/ger/stop-words.txt @@ -0,0 +1,68 @@ +, +der +und +die +in +von +zu +den +mit +für +des +das +auf +nicht +im +sich +dem +eine +ein +! +an +auch +als +bei +? +oder +aus +nach +zum +einer +zur +wie +so +nur +über +durch +um +am +einen +aber +noch +mehr +einem +bis +dass +vor +daß +dieser +wenn +diese +vom +hier +unter +dann +was +keine +eines +ab +da +schon +sehr +diesem +sowie + +u. +bzw +bzw. 
+ diff --git a/src/main/resources/super-blacklist.txt b/src/main/resources/super-blacklist.txt new file mode 100755 index 0000000..e69de29 diff --git a/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java new file mode 100644 index 0000000..00905da --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/ContractionMapperTest.java @@ -0,0 +1,94 @@ +package org.netspeak.preprocessing; + +import java.util.ArrayList; +import java.util.List; +import java.util.function.BiConsumer; + +import org.junit.Assert; +import org.junit.Test; + +public class ContractionMapperTest { + + public List getContractionPatterns() { + List patterns = new ArrayList<>(); + + patterns.add("i'm"); + patterns.add("(he|she|it)'s"); + patterns.add("(you|we|they)'re"); + patterns.add("(i|you|he|she|it|we|they)'(d|ll|ve)"); + patterns.add("y'all"); + patterns.add("(have|has|had|do|does|did|is|are|ai|was|were|wo|would|ca|could|sha|must|need)n't"); + + return patterns; + } + + @Test + public void contractionTest() { + final ContractionMapper mapper = new ContractionMapper(getContractionPatterns()); + + BiConsumer test = (from, to) -> { + String actual = mapper.map(from, 100); + if (actual == to) + return; + if (to == null || actual == null || !to.contentEquals(actual)) { + Assert.fail("\"" + from + "\" was expected to map to \"" + to + "\" but was actually mapped to \"" + + actual + "\"."); + } + }; + + test.accept("Tom", "Tom"); + test.accept("Tom's bar", "Tom's bar"); + test.accept("Tom 's bar", "Tom's bar"); + test.accept("Tom ' s bar", "Tom's bar"); + test.accept("Tom' s bar", "Tom's bar"); + test.accept("Tom s bar", "Tom s bar"); // too little context, so leave it as is + + test.accept("Charls' phone", "Charls' phone"); + test.accept("Charls ' phone", "Charls' phone"); + test.accept("Charls '", "Charls'"); + test.accept("Charls 't", "Charls 't"); + + test.accept("he's nice", "he's nice"); + test.accept("he' 
s nice", "he's nice"); + test.accept("he ' s nice", "he's nice"); + test.accept("he 's nice", "he's nice"); + test.accept("he s nice", "he's nice"); + + test.accept("we'll do it", "we'll do it"); + test.accept("we 'll do it", "we'll do it"); + test.accept("we ' ll do it", "we'll do it"); + test.accept("we' ll do it", "we'll do it"); + test.accept("we ll do it", "we'll do it"); + test.accept("well do it", "well do it"); // well well well + + test.accept("dont", "don't"); + test.accept("don't", "don't"); + test.accept("don 't", "don't"); + test.accept("don ' t", "don't"); + test.accept("don' t", "don't"); + test.accept("don t", "don't"); + + test.accept("DoNt", "DoN't"); + test.accept("DoN't", "DoN't"); + test.accept("DoN 't", "DoN't"); + test.accept("DoN ' t", "DoN't"); + test.accept("DoN' t", "DoN't"); + test.accept("DoN t", "DoN't"); + + test.accept("I'm", "I'm"); + test.accept("I 'm", "I'm"); + test.accept("I ' m", "I'm"); + test.accept("I' m", "I'm"); + test.accept("I m", "I'm"); + + test.accept("I might", "I might"); + + test.accept("won", "won"); + test.accept("won'", null); + test.accept("won '", null); + test.accept("'t open", null); + test.accept("' t open", null); + test.accept("t open", "t open"); // could be real + } + +} diff --git a/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java new file mode 100644 index 0000000..96c9339 --- /dev/null +++ b/src/test/java/org/netspeak/preprocessing/PhraseMappersTest.java @@ -0,0 +1,137 @@ +package org.netspeak.preprocessing; + +import static org.junit.Assert.assertEquals; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; + +import org.junit.Test; + +public class PhraseMappersTest { + + private void phraseMapperTest(PhraseMapper mapper, Collection unchanged, Collection removed, + Map changed) { + String name = mapper.getName(); + + if (unchanged != null) { 
+ for (String expected : unchanged) { + String actual = mapper.map(expected, 100); + assertEquals("Expected unchanged for " + name, expected, actual); + } + } + + if (removed != null) { + for (String expected : removed) { + String actual = mapper.map(expected, 100); + assertEquals("Expected removed for " + name, null, actual); + } + } + + if (changed != null) { + for (Map.Entry transform : changed.entrySet()) { + String actual = mapper.map(transform.getKey(), 100); + assertEquals("Expected changed for " + name, transform.getValue(), actual); + } + } + } + + @Test + public void blacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". - ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 1); + + final Collection unchanged = new ArrayList<>(sharedUnchanged); + unchanged.add("()"); + + final Collection removed = new ArrayList<>(sharedRemoved); + + phraseMapperTest(mapper, unchanged, removed, null); + } + { + final PhraseMapper mapper = PhraseMappers.blacklist(blacklistedWords, 4); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("()()-"); + + final Collection removed = new ArrayList<>(); + removed.add("()()"); + removed.add("-.-."); + removed.add("-.-. foo"); + removed.add("foo -.-. foo"); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + + @Test + public void superBlacklist() { + Set blacklistedWords = new HashSet<>(); + for (String word : ". 
- ( ) \" '".split(" ")) { + blacklistedWords.add(word); + } + + final Collection sharedUnchanged = new ArrayList<>(); + sharedUnchanged.add("foo bar"); + sharedUnchanged.add("foo-bar"); + sharedUnchanged.add("Dr."); + + final Collection sharedRemoved = new ArrayList<>(); + sharedRemoved.add("."); + sharedRemoved.add("."); + sharedRemoved.add("("); + sharedRemoved.add(")"); + sharedRemoved.add("-"); + sharedRemoved.add("foo -"); + sharedRemoved.add("- foo"); + sharedRemoved.add("- foo -"); + sharedRemoved.add("foo - bar"); + + { + final PhraseMapper mapper = PhraseMappers.superBlacklist(blacklistedWords); + + final Collection unchanged = new ArrayList<>(); + unchanged.add("foo bar"); + + final Collection removed = new ArrayList<>(); + removed.add("."); + removed.add("."); + removed.add("("); + removed.add(")"); + removed.add("-"); + removed.add("foo -"); + removed.add("- foo"); + removed.add("- foo -"); + removed.add("foo - bar"); + + removed.add("foo-bar"); + removed.add("Dr."); + + phraseMapperTest(mapper, unchanged, removed, null); + } + } + +}