From c126c87818eb06aa5c2ac23b362d504f342c72b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:22:02 +0200 Subject: [PATCH 01/22] add language files --- .../danish.txt | 94 ++++++ .../dutch.txt | 101 ++++++ .../english.txt | 319 ++++++++++++++++++ .../finnish.txt | 235 +++++++++++++ .../french.txt | 155 +++++++++ .../german.txt | 231 +++++++++++++ .../hungarian.txt | 199 +++++++++++ .../italian.txt | 279 +++++++++++++++ .../norwegian.txt | 176 ++++++++++ .../portuguese.txt | 203 +++++++++++ .../russian.txt | 151 +++++++++ .../spanish.txt | 313 +++++++++++++++++ .../swedish.txt | 114 +++++++ .../turkish.txt | 53 +++ 14 files changed, 2623 insertions(+) create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt create mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt new file mode 100644 index 0000000000000..d3edc6757912e --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt new file mode 100644 index 0000000000000..cafa0324b5376 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt new file mode 100644 index 0000000000000..61e5350dcde39 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt @@ -0,0 +1,319 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt new file mode 100644 index 0000000000000..47ee200f6781d --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt new file mode 100644 index 0000000000000..e7cbf4c975001 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt new file mode 100644 index 0000000000000..edef220b7a7da --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt new file mode 100644 index 0000000000000..94e9f9a0b07a6 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt new file mode 100644 index 0000000000000..6ee02b51fb171 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt new file mode 100644 index 0000000000000..9ac1abbb6cba1 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt new file mode 100644 index 0000000000000..6b2477863b7bb --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt new file mode 100644 index 0000000000000..ecb83d4a7f393 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt new file mode 100644 index 0000000000000..59bc786caa490 --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt new file mode 100644 index 0000000000000..742bb6263b99f --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt new file mode 100644 index 0000000000000..5a48ccce0737b --- /dev/null +++ b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani From 8248579ec27a40de98fe1f3020d947c478981ebc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:23:32 +0200 Subject: [PATCH 02/22] add multi-language support for stop words --- .../spark/ml/feature/StopWordsRemover.scala | 179 ++++++++++-------- 1 file changed, 105 insertions(+), 74 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 0d4c968633295..e7f1d8323376b 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,71 +19,49 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{BooleanParam, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} import org.apache.spark.ml.util._ import org.apache.spark.sql.DataFrame -import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** - * stop words list - */ + * stop words list + */ private[spark] object StopWords { - /** - * Use the same default stopwords list as scikit-learn. - * The original list can be found from "Glasgow Information Retrieval Group" - * [[http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words]] - */ - val English = Array( "a", "about", "above", "across", "after", "afterwards", "again", - "against", "all", "almost", "alone", "along", "already", "also", "although", "always", - "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", - "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", - "around", "as", "at", "back", "be", "became", "because", "become", - "becomes", "becoming", "been", "before", "beforehand", "behind", "being", - "below", "beside", "besides", "between", "beyond", "bill", "both", - "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", - "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", - "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", - "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", - "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", - "find", "fire", "first", "five", "for", "former", "formerly", "forty", - "found", "four", "from", "front", "full", "further", "get", "give", "go", - "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", - "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", - "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed", - "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", - "latterly", "least", "less", "ltd", "made", "many", "may", "me", - "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", - "move", "much", "must", "my", "myself", "name", "namely", "neither", - "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", - "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", - "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", - "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", - "please", "put", "rather", "re", "same", "see", "seem", "seemed", - "seeming", "seems", "serious", "several", "she", "should", "show", "side", - "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone", - "something", "sometime", "sometimes", "somewhere", "still", "such", - "system", "take", "ten", "than", "that", "the", "their", "them", - "themselves", "then", "thence", "there", "thereafter", "thereby", - "therefore", "therein", "thereupon", "these", "they", "thick", "thin", - "third", "this", "those", "though", "three", "through", "throughout", - "thru", "thus", "to", "together", "too", "top", "toward", "towards", - "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", - "very", "via", "was", "we", "well", "were", "what", "whatever", "when", - "whence", "whenever", "where", "whereafter", "whereas", "whereby", - "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", - "who", "whoever", "whole", "whom", "whose", "why", "will", "with", - "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves") + def readStopWords(language: String): Array[String] = { + val is = getClass.getResourceAsStream(s"/$language.txt") + scala.io.Source.fromInputStream(is).getLines().toArray + } + + lazy val Danish = readStopWords("/danish.txt") + lazy val Dutch = readStopWords("/dutch.txt") + lazy val English = readStopWords("/english.txt") + lazy val Finnish = readStopWords("/finnish.txt") + lazy val French = readStopWords("/french.txt") + lazy val German = readStopWords("/german.txt") + lazy val Hungarian = readStopWords("/hungarian.txt") + lazy val Italian = readStopWords("/italian.txt") + lazy val Norwegian = readStopWords("/norwegian.txt") + lazy val Portuguese = readStopWords("/portuguese.txt") + lazy val Russian = readStopWords("/russian.txt") + lazy val Spanish = readStopWords("/spanish.txt") + lazy val Swedish = readStopWords("/swedish.txt") + lazy val Turkish = readStopWords("/turkish.txt") + + val languageMap = Map("danish" -> Danish, "dutch" -> Dutch, "english" -> English, + "finnish" -> Finnish, "french" -> French, "german" -> German, "hungarian" -> Hungarian, + "italian" -> Italian, "norwegian" -> Norwegian, "portuguese" -> Portuguese, + "russian" -> Russian, "spanish" -> Spanish, "swedish" -> Swedish, "turkish" -> Turkish) } /** - * :: Experimental :: - * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. - * @see [[http://en.wikipedia.org/wiki/Stop_words]] - */ + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ @Experimental class StopWordsRemover(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -97,23 +75,26 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out - * Default: [[StopWords.English]] - * @group param - */ + * the stop words set to be filtered out + * Default: [[StopWords.English]] + * @group param + */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ - def setStopWords(value: Array[String]): this.type = set(stopWords, value) + def setStopWords(value: Array[String]): this.type = { + set(stopWords, value) + set(language, "unknown") + } /** @group getParam */ def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words - * Default: false - * @group param - */ + * whether to do a case sensitive comparison over the stop words + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", "whether to do case-sensitive comparison during filtering") @@ -123,21 +104,71 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> StopWords.English, caseSensitive -> false) + /** + * the language of stop words + * Default: "english" + * @group param + */ + val language: Param[String] = new Param[String](this, "language", "stopwords language") + + /** @group setParam */ + def setLanguage(value: String): this.type = { + val lang = value.toLowerCase + require(StopWords.languageMap.contains(lang), s"$lang is not in language list") + set(language, lang) + set(stopWords, StopWords.languageMap(lang)) + } + + /** @group getParam */ + def getLanguage: String = $(language) + + /** + * the ignored stop words set to be ignored out + * Default: [[Array.empty]] + * @group param + */ + val ignoredWords: StringArrayParam = new StringArrayParam(this, "ignoredWords", + "the ignored stop words set to be ignored out") + + /** @group setParam */ + def setIgnoredWords(value: Array[String]): this.type = set(ignoredWords, value) + + /** @group getParam */ + def getIgnoredWords: Array[String] = $(ignoredWords) + + /** + * the additional stop words set to be filtered out + * Default: [[Array.empty]] + * @group param + */ + val additionalWords: StringArrayParam = new StringArrayParam(this, "additionalWords", + "the additional stop words set to be filtered out") + + /** @group setParam */ + def setAdditionalWords(value: Array[String]): this.type = set(additionalWords, value) + + /** @group getParam */ + def getAdditionalWords: Array[String] = $(additionalWords) + + setDefault(stopWords -> StopWords.English, language -> "en", ignoredWords -> Array.empty[String] + , additionalWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = $(stopWords).toSet - udf { terms: Seq[String] => - terms.filter(s => !stopWordsSet.contains(s)) - } - } else { - val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = $(stopWords).map(toLower(_)).toSet - udf { terms: Seq[String] => - terms.filter(s => !lowerStopWords.contains(toLower(s))) - } + val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet - $(ignoredWords).toSet + udf { terms: Seq[String] => + terms.filter(s => !stopWordsSet.contains(s)) + } + } else { + val toLower = (s: String) => if (s != null) s.toLowerCase else s + val lowerStopWords = { + ($(stopWords) ++ $(additionalWords)) + .map(toLower(_)).toSet - $(ignoredWords).map(toLower(_)).toSet + } + udf { terms: Seq[String] => + terms.filter(s => !lowerStopWords.contains(toLower(s))) + } } val metadata = outputSchema($(outputCol)).metadata From 2c7b73df14d2d292eff88d7f3c358d29f82f6122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 15 Mar 2016 00:24:41 +0200 Subject: [PATCH 03/22] add new tests for StopWordsRemover --- .../ml/feature/StopWordsRemoverSuite.scala | 31 +++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index a5b24c18565b9..92c177ad68619 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -67,12 +67,26 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with ignored words") { + val ignoredWords = Array("a") + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setIgnoredWords(ignoredWords) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("python", "scala", "a"), Seq("python", "scala", "a")), + (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("StopWordsRemover with additional words") { - val stopWords = StopWords.English ++ Array("python", "scala") + val additionalWords = Array("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setStopWords(stopWords) + .setAdditionalWords(additionalWords) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) @@ -81,6 +95,19 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with language selection") { + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setLanguage("turkish") + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("acaba", "ama", "biri"), Seq()), + (Seq("hep", "her", "scala"), Seq("scala")) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("read/write") { val t = new StopWordsRemover() .setInputCol("myInputCol") From 43e5cf54d4f9583f8b90291b3c7603ac4e7fab2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:41:47 +0200 Subject: [PATCH 04/22] adjust resource files --- .../spark/ml/feature/stopwords/danish.txt | 94 ++++++ .../spark/ml/feature/stopwords/dutch.txt | 101 ++++++ .../spark/ml/feature/stopwords/english.txt | 319 ++++++++++++++++++ .../spark/ml/feature/stopwords/finnish.txt | 235 +++++++++++++ .../spark/ml/feature/stopwords/french.txt | 155 +++++++++ .../spark/ml/feature/stopwords/german.txt | 231 +++++++++++++ .../spark/ml/feature/stopwords/hungarian.txt | 199 +++++++++++ .../spark/ml/feature/stopwords/italian.txt | 279 +++++++++++++++ .../spark/ml/feature/stopwords/norwegian.txt | 176 ++++++++++ .../spark/ml/feature/stopwords/portuguese.txt | 203 +++++++++++ .../spark/ml/feature/stopwords/russian.txt | 151 +++++++++ .../spark/ml/feature/stopwords/spanish.txt | 313 +++++++++++++++++ .../spark/ml/feature/stopwords/swedish.txt | 114 +++++++ .../spark/ml/feature/stopwords/turkish.txt | 53 +++ 14 files changed, 2623 insertions(+) create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt create mode 100644 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt new file mode 100644 index 0000000000000..d3edc6757912e --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt @@ -0,0 +1,94 @@ +og +i +jeg +det +at +en +den +til +er +som +på +de +med +han +af +for +ikke +der +var +mig +sig +men +et +har +om +vi +min +havde +ham +hun +nu +over +da +fra +du +ud +sin +dem +os +op +man +hans +hvor +eller +hvad +skal +selv +her +alle +vil +blev +kunne +ind +når +være +dog +noget +ville +jo +deres +efter +ned +skulle +denne +end +dette +mit +også +under +have +dig +anden +hende +mine +alt +meget +sit +sine +vor +mod +disse +hvis +din +nogle +hos +blive +mange +ad +bliver +hendes +været +thi +jer +sådan diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt new file mode 100644 index 0000000000000..cafa0324b5376 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt @@ -0,0 +1,101 @@ +de +en +van +ik +te +dat +die +in +een +hij +het +niet +zijn +is +was +op +aan +met +als +voor +had +er +maar +om +hem +dan +zou +of +wat +mijn +men +dit +zo +door +over +ze +zich +bij +ook +tot +je +mij +uit +der +daar +haar +naar +heb +hoe +heeft +hebben +deze +u +want +nog +zal +me +zij +nu +ge +geen +omdat +iets +worden +toch +al +waren +veel +meer +doen +toen +moet +ben +zonder +kan +hun +dus +alles +onder +ja +eens +hier +wie +werd +altijd +doch +wordt +wezen +kunnen +ons +zelf +tegen +na +reeds +wil +kon +niets +uw +iemand +geweest +andere diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt new file mode 100644 index 0000000000000..61e5350dcde39 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt @@ -0,0 +1,319 @@ +a +about +above +across +after +afterwards +again +against +all +almost +alone +along +already +also +although +always +am +among +amongst +amoungst +amount +an +and +another +any +anyhow +anyone +anything +anyway +anywhere +are +around +as +at +back +be +became +because +become +becomes +becoming +been +before +beforehand +behind +being +below +beside +besides +between +beyond +bill +both +bottom +but +by +call +can +cannot +cant +co +computer +con +could +couldnt +cry +de +describe +detail +do +done +down +due +during +each +eg +eight +either +eleven +else +elsewhere +empty +enough +etc +even +ever +every +everyone +everything +everywhere +except +few +fifteen +fify +fill +find +fire +first +five +for +former +formerly +forty +found +four +from +front +full +further +get +give +go +had +has +hasnt +have +he +hence +her +here +hereafter +hereby +herein +hereupon +hers +herself +him +himself +his +how +however +hundred +i +ie +if +in +inc +indeed +interest +into +is +it +its +itself +keep +last +latter +latterly +least +less +ltd +made +many +may +me +meanwhile +might +mill +mine +more +moreover +most +mostly +move +much +must +my +myself +name +namely +neither +never +nevertheless +next +nine +no +nobody +none +noone +nor +not +nothing +now +nowhere +of +off +often +on +once +one +only +onto +or +other +others +otherwise +our +ours +ourselves +out +over +own +part +per +perhaps +please +put +rather +re +same +see +seem +seemed +seeming +seems +serious +several +she +should +show +side +since +sincere +six +sixty +so +some +somehow +someone +something +sometime +sometimes +somewhere +still +such +system +take +ten +than +that +the +their +them +themselves +then +thence +there +thereafter +thereby +therefore +therein +thereupon +these +they +thick +thin +third +this +those +though +three +through +throughout +thru +thus +to +together +too +top +toward +towards +twelve +twenty +two +un +under +until +up +upon +us +very +via +was +we +well +were +what +whatever +when +whence +whenever +where +whereafter +whereas +whereby +wherein +whereupon +wherever +whether +which +while +whither +who +whoever +whole +whom +whose +why +will +with +within +without +would +yet +you +your +yours +yourself +yourselves diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt new file mode 100644 index 0000000000000..47ee200f6781d --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt @@ -0,0 +1,235 @@ +olla +olen +olet +on +olemme +olette +ovat +ole +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet +en +et +ei +emme +ette +eivät +minä +minun +minut +minua +minussa +minusta +minuun +minulla +minulta +minulle +sinä +sinun +sinut +sinua +sinussa +sinusta +sinuun +sinulla +sinulta +sinulle +hän +hänen +hänet +häntä +hänessä +hänestä +häneen +hänellä +häneltä +hänelle +me +meidän +meidät +meitä +meissä +meistä +meihin +meillä +meiltä +meille +te +teidän +teidät +teitä +teissä +teistä +teihin +teillä +teiltä +teille +he +heidän +heidät +heitä +heissä +heistä +heihin +heillä +heiltä +heille +tämä +tämän +tätä +tässä +tästä +tähän +tallä +tältä +tälle +tänä +täksi +tuo +tuon +tuotä +tuossa +tuosta +tuohon +tuolla +tuolta +tuolle +tuona +tuoksi +se +sen +sitä +siinä +siitä +siihen +sillä +siltä +sille +sinä +siksi +nämä +näiden +näitä +näissä +näistä +näihin +näillä +näiltä +näille +näinä +näiksi +nuo +noiden +noita +noissa +noista +noihin +noilla +noilta +noille +noina +noiksi +ne +niiden +niitä +niissä +niistä +niihin +niillä +niiltä +niille +niinä +niiksi +kuka +kenen +kenet +ketä +kenessä +kenestä +keneen +kenellä +keneltä +kenelle +kenenä +keneksi +ketkä +keiden +ketkä +keitä +keissä +keistä +keihin +keillä +keiltä +keille +keinä +keiksi +mikä +minkä +minkä +mitä +missä +mistä +mihin +millä +miltä +mille +minä +miksi +mitkä +joka +jonka +jota +jossa +josta +johon +jolla +jolta +jolle +jona +joksi +jotka +joiden +joita +joissa +joista +joihin +joilla +joilta +joille +joina +joiksi +että +ja +jos +koska +kuin +mutta +niin +sekä +sillä +tai +vaan +vai +vaikka +kanssa +mukaan +noin +poikki +yli +kun +niin +nyt +itse diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt new file mode 100644 index 0000000000000..e7cbf4c975001 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt @@ -0,0 +1,155 @@ +au +aux +avec +ce +ces +dans +de +des +du +elle +en +et +eux +il +je +la +le +leur +lui +ma +mais +me +même +mes +moi +mon +ne +nos +notre +nous +on +ou +par +pas +pour +qu +que +qui +sa +se +ses +son +sur +ta +te +tes +toi +ton +tu +un +une +vos +votre +vous +c +d +j +l +à +m +n +s +t +y +été +étée +étées +étés +étant +étante +étants +étantes +suis +es +est +sommes +êtes +sont +serai +seras +sera +serons +serez +seront +serais +serait +serions +seriez +seraient +étais +était +étions +étiez +étaient +fus +fut +fûmes +fûtes +furent +sois +soit +soyons +soyez +soient +fusse +fusses +fût +fussions +fussiez +fussent +ayant +ayante +ayantes +ayants +eu +eue +eues +eus +ai +as +avons +avez +ont +aurai +auras +aura +aurons +aurez +auront +aurais +aurait +aurions +auriez +auraient +avais +avait +avions +aviez +avaient +eut +eûmes +eûtes +eurent +aie +aies +ait +ayons +ayez +aient +eusse +eusses +eût +eussions +eussiez +eussent diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt new file mode 100644 index 0000000000000..edef220b7a7da --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt @@ -0,0 +1,231 @@ +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +anderm +andern +anderr +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein +seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unsem +unsen +unser +unses +unter +viel +vom +von +vor +während +war +waren +warst +was +weg +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt new file mode 100644 index 0000000000000..94e9f9a0b07a6 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt @@ -0,0 +1,199 @@ +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elõ +elõször +elõtt +elsõ +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +õ +õk +õket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt new file mode 100644 index 0000000000000..6ee02b51fb171 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt @@ -0,0 +1,279 @@ +ad +al +allo +ai +agli +all +agl +alla +alle +con +col +coi +da +dal +dallo +dai +dagli +dall +dagl +dalla +dalle +di +del +dello +dei +degli +dell +degl +della +delle +in +nel +nello +nei +negli +nell +negl +nella +nelle +su +sul +sullo +sui +sugli +sull +sugl +sulla +sulle +per +tra +contro +io +tu +lui +lei +noi +voi +loro +mio +mia +miei +mie +tuo +tua +tuoi +tue +suo +sua +suoi +sue +nostro +nostra +nostri +nostre +vostro +vostra +vostri +vostre +mi +ti +ci +vi +lo +la +li +le +gli +ne +il +un +uno +una +ma +ed +se +perché +anche +come +dov +dove +che +chi +cui +non +più +quale +quanto +quanti +quanta +quante +quello +quelli +quella +quelle +questo +questi +questa +queste +si +tutto +tutti +a +c +e +i +l +o +ho +hai +ha +abbiamo +avete +hanno +abbia +abbiate +abbiano +avrò +avrai +avrà +avremo +avrete +avranno +avrei +avresti +avrebbe +avremmo +avreste +avrebbero +avevo +avevi +aveva +avevamo +avevate +avevano +ebbi +avesti +ebbe +avemmo +aveste +ebbero +avessi +avesse +avessimo +avessero +avendo +avuto +avuta +avuti +avute +sono +sei +è +siamo +siete +sia +siate +siano +sarò +sarai +sarà +saremo +sarete +saranno +sarei +saresti +sarebbe +saremmo +sareste +sarebbero +ero +eri +era +eravamo +eravate +erano +fui +fosti +fu +fummo +foste +furono +fossi +fosse +fossimo +fossero +essendo +faccio +fai +facciamo +fanno +faccia +facciate +facciano +farò +farai +farà +faremo +farete +faranno +farei +faresti +farebbe +faremmo +fareste +farebbero +facevo +facevi +faceva +facevamo +facevate +facevano +feci +facesti +fece +facemmo +faceste +fecero +facessi +facesse +facessimo +facessero +facendo +sto +stai +sta +stiamo +stanno +stia +stiate +stiano +starò +starai +starà +staremo +starete +staranno +starei +staresti +starebbe +staremmo +stareste +starebbero +stavo +stavi +stava +stavamo +stavate +stavano +stetti +stesti +stette +stemmo +steste +stettero +stessi +stesse +stessimo +stessero +stando diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt new file mode 100644 index 0000000000000..9ac1abbb6cba1 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt @@ -0,0 +1,176 @@ +og +i +jeg +det +at +en +et +den +til +er +som +på +de +med +han +av +ikke +ikkje +der +så +var +meg +seg +men +ett +har +om +vi +min +mitt +ha +hadde +hun +nå +over +da +ved +fra +du +ut +sin +dem +oss +opp +man +kan +hans +hvor +eller +hva +skal +selv +sjøl +her +alle +vil +bli +ble +blei +blitt +kunne +inn +når +være +kom +noen +noe +ville +dere +som +deres +kun +ja +etter +ned +skulle +denne +for +deg +si +sine +sitt +mot +å +meget +hvorfor +dette +disse +uten +hvordan +ingen +din +ditt +blir +samme +hvilken +hvilke +sånn +inni +mellom +vår +hver +hvem +vors +hvis +både +bare +enn +fordi +før +mange +også +slik +vært +være +båe +begge +siden +dykk +dykkar +dei +deira +deires +deim +di +då +eg +ein +eit +eitt +elles +honom +hjå +ho +hoe +henne +hennar +hennes +hoss +hossen +ikkje +ingi +inkje +korleis +korso +kva +kvar +kvarhelst +kven +kvi +kvifor +me +medan +mi +mine +mykje +no +nokon +noka +nokor +noko +nokre +si +sia +sidan +so +somt +somme +um +upp +vere +vore +verte +vort +varte +vart diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt new file mode 100644 index 0000000000000..6b2477863b7bb --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt @@ -0,0 +1,203 @@ +de +a +o +que +e +do +da +em +um +para +com +não +uma +os +no +se +na +por +mais +as +dos +como +mas +ao +ele +das +à +seu +sua +ou +quando +muito +nos +já +eu +também +só +pelo +pela +até +isso +ela +entre +depois +sem +mesmo +aos +seus +quem +nas +me +esse +eles +você +essa +num +nem +suas +meu +às +minha +numa +pelos +elas +qual +nós +lhe +deles +essas +esses +pelas +este +dele +tu +te +vocês +vos +lhes +meus +minhas +teu +tua +teus +tuas +nosso +nossa +nossos +nossas +dela +delas +esta +estes +estas +aquele +aquela +aqueles +aquelas +isto +aquilo +estou +está +estamos +estão +estive +esteve +estivemos +estiveram +estava +estávamos +estavam +estivera +estivéramos +esteja +estejamos +estejam +estivesse +estivéssemos +estivessem +estiver +estivermos +estiverem +hei +há +havemos +hão +houve +houvemos +houveram +houvera +houvéramos +haja +hajamos +hajam +houvesse +houvéssemos +houvessem +houver +houvermos +houverem +houverei +houverá +houveremos +houverão +houveria +houveríamos +houveriam +sou +somos +são +era +éramos +eram +fui +foi +fomos +foram +fora +fôramos +seja +sejamos +sejam +fosse +fôssemos +fossem +for +formos +forem +serei +será +seremos +serão +seria +seríamos +seriam +tenho +tem +temos +tém +tinha +tínhamos +tinham +tive +teve +tivemos +tiveram +tivera +tivéramos +tenha +tenhamos +tenham +tivesse +tivéssemos +tivessem +tiver +tivermos +tiverem +terei +terá +teremos +terão +teria +teríamos +teriam diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt new file mode 100644 index 0000000000000..ecb83d4a7f393 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt @@ -0,0 +1,151 @@ +и +в +во +не +что +он +на +я +с +со +как +а +то +все +она +так +его +но +да +ты +к +у +же +вы +за +бы +по +только +ее +мне +было +вот +от +меня +еще +нет +о +из +ему +теперь +когда +даже +ну +вдруг +ли +если +уже +или +ни +быть +был +него +до +вас +нибудь +опять +уж +вам +ведь +там +потом +себя +ничего +ей +может +они +тут +где +есть +надо +ней +для +мы +тебя +их +чем +была +сам +чтоб +без +будто +чего +раз +тоже +себе +под +будет +ж +тогда +кто +этот +того +потому +этого +какой +совсем +ним +здесь +этом +один +почти +мой +тем +чтобы +нее +сейчас +были +куда +зачем +всех +никогда +можно +при +наконец +два +об +другой +хоть +после +над +больше +тот +через +эти +нас +про +всего +них +какая +много +разве +три +эту +моя +впрочем +хорошо +свою +этой +перед +иногда +лучше +чуть +том +нельзя +такой +им +более +всегда +конечно +всю +между diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt new file mode 100644 index 0000000000000..59bc786caa490 --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt @@ -0,0 +1,313 @@ +de +la +que +el +en +y +a +los +del +se +las +por +un +para +con +no +una +su +al +lo +como +más +pero +sus +le +ya +o +este +sí +porque +esta +entre +cuando +muy +sin +sobre +también +me +hasta +hay +donde +quien +desde +todo +nos +durante +todos +uno +les +ni +contra +otros +ese +eso +ante +ellos +e +esto +mí +antes +algunos +qué +unos +yo +otro +otras +otra +él +tanto +esa +estos +mucho +quienes +nada +muchos +cual +poco +ella +estar +estas +algunas +algo +nosotros +mi +mis +tú +te +ti +tu +tus +ellas +nosotras +vosostros +vosostras +os +mío +mía +míos +mías +tuyo +tuya +tuyos +tuyas +suyo +suya +suyos +suyas +nuestro +nuestra +nuestros +nuestras +vuestro +vuestra +vuestros +vuestras +esos +esas +estoy +estás +está +estamos +estáis +están +esté +estés +estemos +estéis +estén +estaré +estarás +estará +estaremos +estaréis +estarán +estaría +estarías +estaríamos +estaríais +estarían +estaba +estabas +estábamos +estabais +estaban +estuve +estuviste +estuvo +estuvimos +estuvisteis +estuvieron +estuviera +estuvieras +estuviéramos +estuvierais +estuvieran +estuviese +estuvieses +estuviésemos +estuvieseis +estuviesen +estando +estado +estada +estados +estadas +estad +he +has +ha +hemos +habéis +han +haya +hayas +hayamos +hayáis +hayan +habré +habrás +habrá +habremos +habréis +habrán +habría +habrías +habríamos +habríais +habrían +había +habías +habíamos +habíais +habían +hube +hubiste +hubo +hubimos +hubisteis +hubieron +hubiera +hubieras +hubiéramos +hubierais +hubieran +hubiese +hubieses +hubiésemos +hubieseis +hubiesen +habiendo +habido +habida +habidos +habidas +soy +eres +es +somos +sois +son +sea +seas +seamos +seáis +sean +seré +serás +será +seremos +seréis +serán +sería +serías +seríamos +seríais +serían +era +eras +éramos +erais +eran +fui +fuiste +fue +fuimos +fuisteis +fueron +fuera +fueras +fuéramos +fuerais +fueran +fuese +fueses +fuésemos +fueseis +fuesen +sintiendo +sentido +sentida +sentidos +sentidas +siente +sentid +tengo +tienes +tiene +tenemos +tenéis +tienen +tenga +tengas +tengamos +tengáis +tengan +tendré +tendrás +tendrá +tendremos +tendréis +tendrán +tendría +tendrías +tendríamos +tendríais +tendrían +tenía +tenías +teníamos +teníais +tenían +tuve +tuviste +tuvo +tuvimos +tuvisteis +tuvieron +tuviera +tuvieras +tuviéramos +tuvierais +tuvieran +tuviese +tuvieses +tuviésemos +tuvieseis +tuviesen +teniendo +tenido +tenida +tenidos +tenidas +tened diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt new file mode 100644 index 0000000000000..742bb6263b99f --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt @@ -0,0 +1,114 @@ +och +det +att +i +en +jag +hon +som +han +på +den +med +var +sig +för +så +till +är +men +ett +om +hade +de +av +icke +mig +du +henne +då +sin +nu +har +inte +hans +honom +skulle +hennes +där +min +man +ej +vid +kunde +något +från +ut +när +efter +upp +vi +dem +vara +vad +över +än +dig +kan +sina +här +ha +mot +alla +under +någon +eller +allt +mycket +sedan +ju +denna +själv +detta +åt +utan +varit +hur +ingen +mitt +ni +bli +blev +oss +din +dessa +några +deras +blir +mina +samma +vilken +er +sådan +vår +blivit +dess +inom +mellan +sådant +varför +varje +vilka +ditt +vem +vilket +sitta +sådana +vart +dina +vars +vårt +våra +ert +era +vilkas diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt new file mode 100644 index 0000000000000..5a48ccce0737b --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt @@ -0,0 +1,53 @@ +acaba +ama +aslında +az +bazı +belki +biri +birkaç +birşey +biz +bu +çok +çünkü +da +daha +de +defa +diye +eğer +en +gibi +hem +hep +hepsi +her +hiç +için +ile +ise +kez +ki +kim +mı +mu +mü +nasıl +ne +neden +nerde +nerede +nereye +niçin +niye +o +sanki +şey +siz +şu +tüm +ve +veya +ya +yani From a43039223a28b308ae1c14d33be5e5a1df382ed6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:43:15 +0200 Subject: [PATCH 05/22] adjust resource files --- .../danish.txt | 94 ------ .../dutch.txt | 101 ------ .../english.txt | 319 ------------------ .../finnish.txt | 235 ------------- .../french.txt | 155 --------- .../german.txt | 231 ------------- .../hungarian.txt | 199 ----------- .../italian.txt | 279 --------------- .../norwegian.txt | 176 ---------- .../portuguese.txt | 203 ----------- .../russian.txt | 151 --------- .../spanish.txt | 313 ----------------- .../swedish.txt | 114 ------- .../turkish.txt | 53 --- 14 files changed, 2623 deletions(-) delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt delete mode 100644 mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt deleted file mode 100644 index d3edc6757912e..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/danish.txt +++ /dev/null @@ -1,94 +0,0 @@ -og -i -jeg -det -at -en -den -til -er -som -på -de -med -han -af -for -ikke -der -var -mig -sig -men -et -har -om -vi -min -havde -ham -hun -nu -over -da -fra -du -ud -sin -dem -os -op -man -hans -hvor -eller -hvad -skal -selv -her -alle -vil -blev -kunne -ind -når -være -dog -noget -ville -jo -deres -efter -ned -skulle -denne -end -dette -mit -også -under -have -dig -anden -hende -mine -alt -meget -sit -sine -vor -mod -disse -hvis -din -nogle -hos -blive -mange -ad -bliver -hendes -været -thi -jer -sådan diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt deleted file mode 100644 index cafa0324b5376..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/dutch.txt +++ /dev/null @@ -1,101 +0,0 @@ -de -en -van -ik -te -dat -die -in -een -hij -het -niet -zijn -is -was -op -aan -met -als -voor -had -er -maar -om -hem -dan -zou -of -wat -mijn -men -dit -zo -door -over -ze -zich -bij -ook -tot -je -mij -uit -der -daar -haar -naar -heb -hoe -heeft -hebben -deze -u -want -nog -zal -me -zij -nu -ge -geen -omdat -iets -worden -toch -al -waren -veel -meer -doen -toen -moet -ben -zonder -kan -hun -dus -alles -onder -ja -eens -hier -wie -werd -altijd -doch -wordt -wezen -kunnen -ons -zelf -tegen -na -reeds -wil -kon -niets -uw -iemand -geweest -andere diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt deleted file mode 100644 index 61e5350dcde39..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/english.txt +++ /dev/null @@ -1,319 +0,0 @@ -a -about -above -across -after -afterwards -again -against -all -almost -alone -along -already -also -although -always -am -among -amongst -amoungst -amount -an -and -another -any -anyhow -anyone -anything -anyway -anywhere -are -around -as -at -back -be -became -because -become -becomes -becoming -been -before -beforehand -behind -being -below -beside -besides -between -beyond -bill -both -bottom -but -by -call -can -cannot -cant -co -computer -con -could -couldnt -cry -de -describe -detail -do -done -down -due -during -each -eg -eight -either -eleven -else -elsewhere -empty -enough -etc -even -ever -every -everyone -everything -everywhere -except -few -fifteen -fify -fill -find -fire -first -five -for -former -formerly -forty -found -four -from -front -full -further -get -give -go -had -has -hasnt -have -he -hence -her -here -hereafter -hereby -herein -hereupon -hers -herself -him -himself -his -how -however -hundred -i -ie -if -in -inc -indeed -interest -into -is -it -its -itself -keep -last -latter -latterly -least -less -ltd -made -many -may -me -meanwhile -might -mill -mine -more -moreover -most -mostly -move -much -must -my -myself -name -namely -neither -never -nevertheless -next -nine -no -nobody -none -noone -nor -not -nothing -now -nowhere -of -off -often -on -once -one -only -onto -or -other -others -otherwise -our -ours -ourselves -out -over -own -part -per -perhaps -please -put -rather -re -same -see -seem -seemed -seeming -seems -serious -several -she -should -show -side -since -sincere -six -sixty -so -some -somehow -someone -something -sometime -sometimes -somewhere -still -such -system -take -ten -than -that -the -their -them -themselves -then -thence -there -thereafter -thereby -therefore -therein -thereupon -these -they -thick -thin -third -this -those -though -three -through -throughout -thru -thus -to -together -too -top -toward -towards -twelve -twenty -two -un -under -until -up -upon -us -very -via -was -we -well -were -what -whatever -when -whence -whenever -where -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -whoever -whole -whom -whose -why -will -with -within -without -would -yet -you -your -yours -yourself -yourselves diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt deleted file mode 100644 index 47ee200f6781d..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/finnish.txt +++ /dev/null @@ -1,235 +0,0 @@ -olla -olen -olet -on -olemme -olette -ovat -ole -oli -olisi -olisit -olisin -olisimme -olisitte -olisivat -olit -olin -olimme -olitte -olivat -ollut -olleet -en -et -ei -emme -ette -eivät -minä -minun -minut -minua -minussa -minusta -minuun -minulla -minulta -minulle -sinä -sinun -sinut -sinua -sinussa -sinusta -sinuun -sinulla -sinulta -sinulle -hän -hänen -hänet -häntä -hänessä -hänestä -häneen -hänellä -häneltä -hänelle -me -meidän -meidät -meitä -meissä -meistä -meihin -meillä -meiltä -meille -te -teidän -teidät -teitä -teissä -teistä -teihin -teillä -teiltä -teille -he -heidän -heidät -heitä -heissä -heistä -heihin -heillä -heiltä -heille -tämä -tämän -tätä -tässä -tästä -tähän -tallä -tältä -tälle -tänä -täksi -tuo -tuon -tuotä -tuossa -tuosta -tuohon -tuolla -tuolta -tuolle -tuona -tuoksi -se -sen -sitä -siinä -siitä -siihen -sillä -siltä -sille -sinä -siksi -nämä -näiden -näitä -näissä -näistä -näihin -näillä -näiltä -näille -näinä -näiksi -nuo -noiden -noita -noissa -noista -noihin -noilla -noilta -noille -noina -noiksi -ne -niiden -niitä -niissä -niistä -niihin -niillä -niiltä -niille -niinä -niiksi -kuka -kenen -kenet -ketä -kenessä -kenestä -keneen -kenellä -keneltä -kenelle -kenenä -keneksi -ketkä -keiden -ketkä -keitä -keissä -keistä -keihin -keillä -keiltä -keille -keinä -keiksi -mikä -minkä -minkä -mitä -missä -mistä -mihin -millä -miltä -mille -minä -miksi -mitkä -joka -jonka -jota -jossa -josta -johon -jolla -jolta -jolle -jona -joksi -jotka -joiden -joita -joissa -joista -joihin -joilla -joilta -joille -joina -joiksi -että -ja -jos -koska -kuin -mutta -niin -sekä -sillä -tai -vaan -vai -vaikka -kanssa -mukaan -noin -poikki -yli -kun -niin -nyt -itse diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt deleted file mode 100644 index e7cbf4c975001..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/french.txt +++ /dev/null @@ -1,155 +0,0 @@ -au -aux -avec -ce -ces -dans -de -des -du -elle -en -et -eux -il -je -la -le -leur -lui -ma -mais -me -même -mes -moi -mon -ne -nos -notre -nous -on -ou -par -pas -pour -qu -que -qui -sa -se -ses -son -sur -ta -te -tes -toi -ton -tu -un -une -vos -votre -vous -c -d -j -l -à -m -n -s -t -y -été -étée -étées -étés -étant -étante -étants -étantes -suis -es -est -sommes -êtes -sont -serai -seras -sera -serons -serez -seront -serais -serait -serions -seriez -seraient -étais -était -étions -étiez -étaient -fus -fut -fûmes -fûtes -furent -sois -soit -soyons -soyez -soient -fusse -fusses -fût -fussions -fussiez -fussent -ayant -ayante -ayantes -ayants -eu -eue -eues -eus -ai -as -avons -avez -ont -aurai -auras -aura -aurons -aurez -auront -aurais -aurait -aurions -auriez -auraient -avais -avait -avions -aviez -avaient -eut -eûmes -eûtes -eurent -aie -aies -ait -ayons -ayez -aient -eusse -eusses -eût -eussions -eussiez -eussent diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt deleted file mode 100644 index edef220b7a7da..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/german.txt +++ /dev/null @@ -1,231 +0,0 @@ -aber -alle -allem -allen -aller -alles -als -also -am -an -ander -andere -anderem -anderen -anderer -anderes -anderm -andern -anderr -anders -auch -auf -aus -bei -bin -bis -bist -da -damit -dann -der -den -des -dem -die -das -daß -derselbe -derselben -denselben -desselben -demselben -dieselbe -dieselben -dasselbe -dazu -dein -deine -deinem -deinen -deiner -deines -denn -derer -dessen -dich -dir -du -dies -diese -diesem -diesen -dieser -dieses -doch -dort -durch -ein -eine -einem -einen -einer -eines -einig -einige -einigem -einigen -einiger -einiges -einmal -er -ihn -ihm -es -etwas -euer -eure -eurem -euren -eurer -eures -für -gegen -gewesen -hab -habe -haben -hat -hatte -hatten -hier -hin -hinter -ich -mich -mir -ihr -ihre -ihrem -ihren -ihrer -ihres -euch -im -in -indem -ins -ist -jede -jedem -jeden -jeder -jedes -jene -jenem -jenen -jener -jenes -jetzt -kann -kein -keine -keinem -keinen -keiner -keines -können -könnte -machen -man -manche -manchem -manchen -mancher -manches -mein -meine -meinem -meinen -meiner -meines -mit -muss -musste -nach -nicht -nichts -noch -nun -nur -ob -oder -ohne -sehr -sein -seine -seinem -seinen -seiner -seines -selbst -sich -sie -ihnen -sind -so -solche -solchem -solchen -solcher -solches -soll -sollte -sondern -sonst -über -um -und -uns -unse -unsem -unsen -unser -unses -unter -viel -vom -von -vor -während -war -waren -warst -was -weg -weil -weiter -welche -welchem -welchen -welcher -welches -wenn -werde -werden -wie -wieder -will -wir -wird -wirst -wo -wollen -wollte -würde -würden -zu -zum -zur -zwar -zwischen diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt deleted file mode 100644 index 94e9f9a0b07a6..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/hungarian.txt +++ /dev/null @@ -1,199 +0,0 @@ -a -ahogy -ahol -aki -akik -akkor -alatt -által -általában -amely -amelyek -amelyekben -amelyeket -amelyet -amelynek -ami -amit -amolyan -amíg -amikor -át -abban -ahhoz -annak -arra -arról -az -azok -azon -azt -azzal -azért -aztán -azután -azonban -bár -be -belül -benne -cikk -cikkek -cikkeket -csak -de -e -eddig -egész -egy -egyes -egyetlen -egyéb -egyik -egyre -ekkor -el -elég -ellen -elõ -elõször -elõtt -elsõ -én -éppen -ebben -ehhez -emilyen -ennek -erre -ez -ezt -ezek -ezen -ezzel -ezért -és -fel -felé -hanem -hiszen -hogy -hogyan -igen -így -illetve -ill. -ill -ilyen -ilyenkor -ison -ismét -itt -jó -jól -jobban -kell -kellett -keresztül -keressünk -ki -kívül -között -közül -legalább -lehet -lehetett -legyen -lenne -lenni -lesz -lett -maga -magát -majd -majd -már -más -másik -meg -még -mellett -mert -mely -melyek -mi -mit -míg -miért -milyen -mikor -minden -mindent -mindenki -mindig -mint -mintha -mivel -most -nagy -nagyobb -nagyon -ne -néha -nekem -neki -nem -néhány -nélkül -nincs -olyan -ott -össze -õ -õk -õket -pedig -persze -rá -s -saját -sem -semmi -sok -sokat -sokkal -számára -szemben -szerint -szinte -talán -tehát -teljes -tovább -továbbá -több -úgy -ugyanis -új -újabb -újra -után -utána -utolsó -vagy -vagyis -valaki -valami -valamint -való -vagyok -van -vannak -volt -voltam -voltak -voltunk -vissza -vele -viszont -volna diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt deleted file mode 100644 index 6ee02b51fb171..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/italian.txt +++ /dev/null @@ -1,279 +0,0 @@ -ad -al -allo -ai -agli -all -agl -alla -alle -con -col -coi -da -dal -dallo -dai -dagli -dall -dagl -dalla -dalle -di -del -dello -dei -degli -dell -degl -della -delle -in -nel -nello -nei -negli -nell -negl -nella -nelle -su -sul -sullo -sui -sugli -sull -sugl -sulla -sulle -per -tra -contro -io -tu -lui -lei -noi -voi -loro -mio -mia -miei -mie -tuo -tua -tuoi -tue -suo -sua -suoi -sue -nostro -nostra -nostri -nostre -vostro -vostra -vostri -vostre -mi -ti -ci -vi -lo -la -li -le -gli -ne -il -un -uno -una -ma -ed -se -perché -anche -come -dov -dove -che -chi -cui -non -più -quale -quanto -quanti -quanta -quante -quello -quelli -quella -quelle -questo -questi -questa -queste -si -tutto -tutti -a -c -e -i -l -o -ho -hai -ha -abbiamo -avete -hanno -abbia -abbiate -abbiano -avrò -avrai -avrà -avremo -avrete -avranno -avrei -avresti -avrebbe -avremmo -avreste -avrebbero -avevo -avevi -aveva -avevamo -avevate -avevano -ebbi -avesti -ebbe -avemmo -aveste -ebbero -avessi -avesse -avessimo -avessero -avendo -avuto -avuta -avuti -avute -sono -sei -è -siamo -siete -sia -siate -siano -sarò -sarai -sarà -saremo -sarete -saranno -sarei -saresti -sarebbe -saremmo -sareste -sarebbero -ero -eri -era -eravamo -eravate -erano -fui -fosti -fu -fummo -foste -furono -fossi -fosse -fossimo -fossero -essendo -faccio -fai -facciamo -fanno -faccia -facciate -facciano -farò -farai -farà -faremo -farete -faranno -farei -faresti -farebbe -faremmo -fareste -farebbero -facevo -facevi -faceva -facevamo -facevate -facevano -feci -facesti -fece -facemmo -faceste -fecero -facessi -facesse -facessimo -facessero -facendo -sto -stai -sta -stiamo -stanno -stia -stiate -stiano -starò -starai -starà -staremo -starete -staranno -starei -staresti -starebbe -staremmo -stareste -starebbero -stavo -stavi -stava -stavamo -stavate -stavano -stetti -stesti -stette -stemmo -steste -stettero -stessi -stesse -stessimo -stessero -stando diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt deleted file mode 100644 index 9ac1abbb6cba1..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/norwegian.txt +++ /dev/null @@ -1,176 +0,0 @@ -og -i -jeg -det -at -en -et -den -til -er -som -på -de -med -han -av -ikke -ikkje -der -så -var -meg -seg -men -ett -har -om -vi -min -mitt -ha -hadde -hun -nå -over -da -ved -fra -du -ut -sin -dem -oss -opp -man -kan -hans -hvor -eller -hva -skal -selv -sjøl -her -alle -vil -bli -ble -blei -blitt -kunne -inn -når -være -kom -noen -noe -ville -dere -som -deres -kun -ja -etter -ned -skulle -denne -for -deg -si -sine -sitt -mot -å -meget -hvorfor -dette -disse -uten -hvordan -ingen -din -ditt -blir -samme -hvilken -hvilke -sånn -inni -mellom -vår -hver -hvem -vors -hvis -både -bare -enn -fordi -før -mange -også -slik -vært -være -båe -begge -siden -dykk -dykkar -dei -deira -deires -deim -di -då -eg -ein -eit -eitt -elles -honom -hjå -ho -hoe -henne -hennar -hennes -hoss -hossen -ikkje -ingi -inkje -korleis -korso -kva -kvar -kvarhelst -kven -kvi -kvifor -me -medan -mi -mine -mykje -no -nokon -noka -nokor -noko -nokre -si -sia -sidan -so -somt -somme -um -upp -vere -vore -verte -vort -varte -vart diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt deleted file mode 100644 index 6b2477863b7bb..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/portuguese.txt +++ /dev/null @@ -1,203 +0,0 @@ -de -a -o -que -e -do -da -em -um -para -com -não -uma -os -no -se -na -por -mais -as -dos -como -mas -ao -ele -das -à -seu -sua -ou -quando -muito -nos -já -eu -também -só -pelo -pela -até -isso -ela -entre -depois -sem -mesmo -aos -seus -quem -nas -me -esse -eles -você -essa -num -nem -suas -meu -às -minha -numa -pelos -elas -qual -nós -lhe -deles -essas -esses -pelas -este -dele -tu -te -vocês -vos -lhes -meus -minhas -teu -tua -teus -tuas -nosso -nossa -nossos -nossas -dela -delas -esta -estes -estas -aquele -aquela -aqueles -aquelas -isto -aquilo -estou -está -estamos -estão -estive -esteve -estivemos -estiveram -estava -estávamos -estavam -estivera -estivéramos -esteja -estejamos -estejam -estivesse -estivéssemos -estivessem -estiver -estivermos -estiverem -hei -há -havemos -hão -houve -houvemos -houveram -houvera -houvéramos -haja -hajamos -hajam -houvesse -houvéssemos -houvessem -houver -houvermos -houverem -houverei -houverá -houveremos -houverão -houveria -houveríamos -houveriam -sou -somos -são -era -éramos -eram -fui -foi -fomos -foram -fora -fôramos -seja -sejamos -sejam -fosse -fôssemos -fossem -for -formos -forem -serei -será -seremos -serão -seria -seríamos -seriam -tenho -tem -temos -tém -tinha -tínhamos -tinham -tive -teve -tivemos -tiveram -tivera -tivéramos -tenha -tenhamos -tenham -tivesse -tivéssemos -tivessem -tiver -tivermos -tiverem -terei -terá -teremos -terão -teria -teríamos -teriam diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt deleted file mode 100644 index ecb83d4a7f393..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/russian.txt +++ /dev/null @@ -1,151 +0,0 @@ -и -в -во -не -что -он -на -я -с -со -как -а -то -все -она -так -его -но -да -ты -к -у -же -вы -за -бы -по -только -ее -мне -было -вот -от -меня -еще -нет -о -из -ему -теперь -когда -даже -ну -вдруг -ли -если -уже -или -ни -быть -был -него -до -вас -нибудь -опять -уж -вам -ведь -там -потом -себя -ничего -ей -может -они -тут -где -есть -надо -ней -для -мы -тебя -их -чем -была -сам -чтоб -без -будто -чего -раз -тоже -себе -под -будет -ж -тогда -кто -этот -того -потому -этого -какой -совсем -ним -здесь -этом -один -почти -мой -тем -чтобы -нее -сейчас -были -куда -зачем -всех -никогда -можно -при -наконец -два -об -другой -хоть -после -над -больше -тот -через -эти -нас -про -всего -них -какая -много -разве -три -эту -моя -впрочем -хорошо -свою -этой -перед -иногда -лучше -чуть -том -нельзя -такой -им -более -всегда -конечно -всю -между diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt deleted file mode 100644 index 59bc786caa490..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/spanish.txt +++ /dev/null @@ -1,313 +0,0 @@ -de -la -que -el -en -y -a -los -del -se -las -por -un -para -con -no -una -su -al -lo -como -más -pero -sus -le -ya -o -este -sí -porque -esta -entre -cuando -muy -sin -sobre -también -me -hasta -hay -donde -quien -desde -todo -nos -durante -todos -uno -les -ni -contra -otros -ese -eso -ante -ellos -e -esto -mí -antes -algunos -qué -unos -yo -otro -otras -otra -él -tanto -esa -estos -mucho -quienes -nada -muchos -cual -poco -ella -estar -estas -algunas -algo -nosotros -mi -mis -tú -te -ti -tu -tus -ellas -nosotras -vosostros -vosostras -os -mío -mía -míos -mías -tuyo -tuya -tuyos -tuyas -suyo -suya -suyos -suyas -nuestro -nuestra -nuestros -nuestras -vuestro -vuestra -vuestros -vuestras -esos -esas -estoy -estás -está -estamos -estáis -están -esté -estés -estemos -estéis -estén -estaré -estarás -estará -estaremos -estaréis -estarán -estaría -estarías -estaríamos -estaríais -estarían -estaba -estabas -estábamos -estabais -estaban -estuve -estuviste -estuvo -estuvimos -estuvisteis -estuvieron -estuviera -estuvieras -estuviéramos -estuvierais -estuvieran -estuviese -estuvieses -estuviésemos -estuvieseis -estuviesen -estando -estado -estada -estados -estadas -estad -he -has -ha -hemos -habéis -han -haya -hayas -hayamos -hayáis -hayan -habré -habrás -habrá -habremos -habréis -habrán -habría -habrías -habríamos -habríais -habrían -había -habías -habíamos -habíais -habían -hube -hubiste -hubo -hubimos -hubisteis -hubieron -hubiera -hubieras -hubiéramos -hubierais -hubieran -hubiese -hubieses -hubiésemos -hubieseis -hubiesen -habiendo -habido -habida -habidos -habidas -soy -eres -es -somos -sois -son -sea -seas -seamos -seáis -sean -seré -serás -será -seremos -seréis -serán -sería -serías -seríamos -seríais -serían -era -eras -éramos -erais -eran -fui -fuiste -fue -fuimos -fuisteis -fueron -fuera -fueras -fuéramos -fuerais -fueran -fuese -fueses -fuésemos -fueseis -fuesen -sintiendo -sentido -sentida -sentidos -sentidas -siente -sentid -tengo -tienes -tiene -tenemos -tenéis -tienen -tenga -tengas -tengamos -tengáis -tengan -tendré -tendrás -tendrá -tendremos -tendréis -tendrán -tendría -tendrías -tendríamos -tendríais -tendrían -tenía -tenías -teníamos -teníais -tenían -tuve -tuviste -tuvo -tuvimos -tuvisteis -tuvieron -tuviera -tuvieras -tuviéramos -tuvierais -tuvieran -tuviese -tuvieses -tuviésemos -tuvieseis -tuviesen -teniendo -tenido -tenida -tenidos -tenidas -tened diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt deleted file mode 100644 index 742bb6263b99f..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/swedish.txt +++ /dev/null @@ -1,114 +0,0 @@ -och -det -att -i -en -jag -hon -som -han -på -den -med -var -sig -för -så -till -är -men -ett -om -hade -de -av -icke -mig -du -henne -då -sin -nu -har -inte -hans -honom -skulle -hennes -där -min -man -ej -vid -kunde -något -från -ut -när -efter -upp -vi -dem -vara -vad -över -än -dig -kan -sina -här -ha -mot -alla -under -någon -eller -allt -mycket -sedan -ju -denna -själv -detta -åt -utan -varit -hur -ingen -mitt -ni -bli -blev -oss -din -dessa -några -deras -blir -mina -samma -vilken -er -sådan -vår -blivit -dess -inom -mellan -sådant -varför -varje -vilka -ditt -vem -vilket -sitta -sådana -vart -dina -vars -vårt -våra -ert -era -vilkas diff --git a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt b/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt deleted file mode 100644 index 5a48ccce0737b..0000000000000 --- a/mllib/src/main/resources/org.apache.spark.ml.feature.stopwords/turkish.txt +++ /dev/null @@ -1,53 +0,0 @@ -acaba -ama -aslında -az -bazı -belki -biri -birkaç -birşey -biz -bu -çok -çünkü -da -daha -de -defa -diye -eğer -en -gibi -hem -hep -hepsi -her -hiç -için -ile -ise -kez -ki -kim -mı -mu -mü -nasıl -ne -neden -nerde -nerede -nereye -niçin -niye -o -sanki -şey -siz -şu -tüm -ve -veya -ya -yani From 28ee249f676971371d11d16c2912bbf81e045269 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:46:42 +0200 Subject: [PATCH 06/22] fix stopwords bug --- .../spark/ml/feature/StopWordsRemover.scala | 74 +++++++++---------- 1 file changed, 34 insertions(+), 40 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index e7f1d8323376b..56db88950cabc 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -19,49 +19,40 @@ package org.apache.spark.ml.feature import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} /** - * stop words list - */ + * stop words list + */ private[spark] object StopWords { + /** Read stop words list from resources */ def readStopWords(language: String): Array[String] = { - val is = getClass.getResourceAsStream(s"/$language.txt") + val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is).getLines().toArray } - lazy val Danish = readStopWords("/danish.txt") - lazy val Dutch = readStopWords("/dutch.txt") - lazy val English = readStopWords("/english.txt") - lazy val Finnish = readStopWords("/finnish.txt") - lazy val French = readStopWords("/french.txt") - lazy val German = readStopWords("/german.txt") - lazy val Hungarian = readStopWords("/hungarian.txt") - lazy val Italian = readStopWords("/italian.txt") - lazy val Norwegian = readStopWords("/norwegian.txt") - lazy val Portuguese = readStopWords("/portuguese.txt") - lazy val Russian = readStopWords("/russian.txt") - lazy val Spanish = readStopWords("/spanish.txt") - lazy val Swedish = readStopWords("/swedish.txt") - lazy val Turkish = readStopWords("/turkish.txt") - - val languageMap = Map("danish" -> Danish, "dutch" -> Dutch, "english" -> English, - "finnish" -> Finnish, "french" -> French, "german" -> German, "hungarian" -> Hungarian, - "italian" -> Italian, "norwegian" -> Norwegian, "portuguese" -> Portuguese, - "russian" -> Russian, "spanish" -> Spanish, "swedish" -> Swedish, "turkish" -> Turkish) + /** Supported languages list must be lowercase */ + val supportedLanguages = Array("danish", "dutch", "english", "finnish", "french", "german", + "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") + + /** Languages and stopwords map */ + val languageMap = supportedLanguages.map{ + language => language -> readStopWords(language) + }.toMap } /** - * :: Experimental :: - * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. - * @see [[http://en.wikipedia.org/wiki/Stop_words]] - */ + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ @Experimental class StopWordsRemover(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -75,10 +66,10 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out - * Default: [[StopWords.English]] - * @group param - */ + * the stop words set to be filtered out + * Default: [[StopWords.English]] + * @group param + */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ @@ -91,10 +82,10 @@ class StopWordsRemover(override val uid: String) def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words - * Default: false - * @group param - */ + * whether to do a case sensitive comparison over the stop words + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", "whether to do case-sensitive comparison during filtering") @@ -150,13 +141,16 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getAdditionalWords: Array[String] = $(additionalWords) - setDefault(stopWords -> StopWords.English, language -> "en", ignoredWords -> Array.empty[String] - , additionalWords -> Array.empty[String], caseSensitive -> false) + setDefault(stopWords -> StopWords.languageMap("english"), + language -> "en", + ignoredWords -> Array.empty[String], + additionalWords -> Array.empty[String], + caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet - $(ignoredWords).toSet + val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet -- $(ignoredWords).toSet udf { terms: Seq[String] => terms.filter(s => !stopWordsSet.contains(s)) } @@ -164,7 +158,7 @@ class StopWordsRemover(override val uid: String) val toLower = (s: String) => if (s != null) s.toLowerCase else s val lowerStopWords = { ($(stopWords) ++ $(additionalWords)) - .map(toLower(_)).toSet - $(ignoredWords).map(toLower(_)).toSet + .map(toLower(_)).toSet -- $(ignoredWords).map(toLower(_)).toSet } udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) From 6d215b31a205c4a79e8cc0ef6963d239941e80ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 01:53:06 +0200 Subject: [PATCH 07/22] update comment lines --- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 56db88950cabc..40674b2aaee03 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -67,7 +67,7 @@ class StopWordsRemover(override val uid: String) /** * the stop words set to be filtered out - * Default: [[StopWords.English]] + * Default: [[StopWords.languageMap("english")]] * @group param */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") @@ -142,7 +142,7 @@ class StopWordsRemover(override val uid: String) def getAdditionalWords: Array[String] = $(additionalWords) setDefault(stopWords -> StopWords.languageMap("english"), - language -> "en", + language -> "english", ignoredWords -> Array.empty[String], additionalWords -> Array.empty[String], caseSensitive -> false) From 6deceecf88c66b3293698aca5d7306c2aa02e2e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 18:24:38 +0200 Subject: [PATCH 08/22] update stop words list --- .../spark/ml/feature/stopwords/danish.txt | 2 +- .../spark/ml/feature/stopwords/dutch.txt | 2 +- .../spark/ml/feature/stopwords/english.txt | 422 ++++++------------ .../spark/ml/feature/stopwords/finnish.txt | 2 +- .../spark/ml/feature/stopwords/french.txt | 2 +- .../spark/ml/feature/stopwords/german.txt | 2 +- .../spark/ml/feature/stopwords/hungarian.txt | 2 +- .../spark/ml/feature/stopwords/italian.txt | 2 +- .../spark/ml/feature/stopwords/norwegian.txt | 2 +- .../spark/ml/feature/stopwords/portuguese.txt | 2 +- .../spark/ml/feature/stopwords/russian.txt | 2 +- .../spark/ml/feature/stopwords/spanish.txt | 2 +- .../spark/ml/feature/stopwords/swedish.txt | 2 +- .../spark/ml/feature/stopwords/turkish.txt | 2 +- 14 files changed, 141 insertions(+), 307 deletions(-) diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt index d3edc6757912e..ea9e2c4abe5b9 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/danish.txt @@ -91,4 +91,4 @@ hendes været thi jer -sådan +sådan \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt index cafa0324b5376..023cc2c939b2a 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/dutch.txt @@ -98,4 +98,4 @@ niets uw iemand geweest -andere +andere \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt index 61e5350dcde39..d075cc0babc3e 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/english.txt @@ -1,319 +1,153 @@ -a -about -above -across -after -afterwards -again -against -all -almost -alone -along -already -also -although -always +i +me +my +myself +we +our +ours +ourselves +you +your +yours +yourself +yourselves +he +him +his +himself +she +her +hers +herself +it +its +itself +they +them +their +theirs +themselves +what +which +who +whom +this +that +these +those am -among -amongst -amoungst -amount -an -and -another -any -anyhow -anyone -anything -anyway -anywhere +is are -around -as -at -back +was +were be -became -because -become -becomes -becoming been -before -beforehand -behind being -below -beside -besides -between -beyond -bill -both -bottom +have +has +had +having +do +does +did +doing +a +an +the +and but +if +or +because +as +until +while +of +at by -call -can -cannot -cant -co -computer -con -could -couldnt -cry -de -describe -detail -do -done -down -due -during -each -eg -eight -either -eleven -else -elsewhere -empty -enough -etc -even -ever -every -everyone -everything -everywhere -except -few -fifteen -fify -fill -find -fire -first -five for -former -formerly -forty -found -four +with +about +against +between +into +through +during +before +after +above +below +to from -front -full +up +down +in +out +on +off +over +under +again further -get -give -go -had -has -hasnt -have -he -hence -her +then +once here -hereafter -hereby -herein -hereupon -hers -herself -him -himself -his +there +when +where +why how -however -hundred -i -ie -if -in -inc -indeed -interest -into -is -it -its -itself -keep -last -latter -latterly -least -less -ltd -made -many -may -me -meanwhile -might -mill -mine +all +any +both +each +few more -moreover most -mostly -move -much -must -my -myself -name -namely -neither -never -nevertheless -next -nine +other +some +such no -nobody -none -noone nor not -nothing -now -nowhere -of -off -often -on -once -one only -onto -or -other -others -otherwise -our -ours -ourselves -out -over own -part -per -perhaps -please -put -rather -re same -see -seem -seemed -seeming -seems -serious -several -she -should -show -side -since -sincere -six -sixty so -some -somehow -someone -something -sometime -sometimes -somewhere -still -such -system -take -ten than -that -the -their -them -themselves -then -thence -there -thereafter -thereby -therefore -therein -thereupon -these -they -thick -thin -third -this -those -though -three -through -throughout -thru -thus -to -together too -top -toward -towards -twelve -twenty -two -un -under -until -up -upon -us very -via -was -we -well -were -what -whatever -when -whence -whenever -where -whereafter -whereas -whereby -wherein -whereupon -wherever -whether -which -while -whither -who -whoever -whole -whom -whose -why +s +t +can will -with -within -without -would -yet -you -your -yours -yourself -yourselves +just +don +should +now +d +ll +m +o +re +ve +y +ain +aren +couldn +didn +doesn +hadn +hasn +haven +isn +ma +mightn +mustn +needn +shan +shouldn +wasn +weren +won +wouldn diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt index 47ee200f6781d..5b0eb10777d0e 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/finnish.txt @@ -232,4 +232,4 @@ yli kun niin nyt -itse +itse \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt index e7cbf4c975001..94b8f8f39a3e1 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/french.txt @@ -152,4 +152,4 @@ eusses eût eussions eussiez -eussent +eussent \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt index edef220b7a7da..7e65190f8ba28 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/german.txt @@ -228,4 +228,4 @@ zu zum zur zwar -zwischen +zwischen \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt index 94e9f9a0b07a6..8d4543a0965d5 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/hungarian.txt @@ -196,4 +196,4 @@ voltunk vissza vele viszont -volna +volna \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt index 6ee02b51fb171..783b2e0cbfcd8 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/italian.txt @@ -276,4 +276,4 @@ stessi stesse stessimo stessero -stando +stando \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt index 9ac1abbb6cba1..cb91702c5e9a9 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/norwegian.txt @@ -173,4 +173,4 @@ vore verte vort varte -vart +vart \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt index 6b2477863b7bb..98b4fdcdf7a20 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/portuguese.txt @@ -200,4 +200,4 @@ teremos terão teria teríamos -teriam +teriam \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt index ecb83d4a7f393..8a800b74497dd 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/russian.txt @@ -148,4 +148,4 @@ всегда конечно всю -между +между \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt index 59bc786caa490..94f493a8d1e03 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/spanish.txt @@ -310,4 +310,4 @@ tenido tenida tenidos tenidas -tened +tened \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt index 742bb6263b99f..9fae31c1858a9 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/swedish.txt @@ -111,4 +111,4 @@ vårt våra ert era -vilkas +vilkas \ No newline at end of file diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt index 5a48ccce0737b..4e9708d9d2c5e 100644 --- a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/turkish.txt @@ -50,4 +50,4 @@ tüm ve veya ya -yani +yani \ No newline at end of file From 41cd25815af3baa8fe9ed9336812f436d7ed7bd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 18:25:36 +0200 Subject: [PATCH 09/22] update stopwordsremover --- .../spark/ml/feature/StopWordsRemover.scala | 83 ++++++------------- .../ml/feature/StopWordsRemoverSuite.scala | 26 +++++- 2 files changed, 48 insertions(+), 61 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 40674b2aaee03..f3cd55c1984e4 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -33,18 +33,14 @@ private[spark] object StopWords { /** Read stop words list from resources */ def readStopWords(language: String): Array[String] = { + require(supportedLanguages.contains(language), s"$language is not in language list") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is).getLines().toArray } /** Supported languages list must be lowercase */ - val supportedLanguages = Array("danish", "dutch", "english", "finnish", "french", "german", + private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") - - /** Languages and stopwords map */ - val languageMap = supportedLanguages.map{ - language => language -> readStopWords(language) - }.toMap } /** @@ -67,16 +63,13 @@ class StopWordsRemover(override val uid: String) /** * the stop words set to be filtered out - * Default: [[StopWords.languageMap("english")]] + * Default: [[Array.empty]] * @group param */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") /** @group setParam */ - def setStopWords(value: Array[String]): this.type = { - set(stopWords, value) - set(language, "unknown") - } + def setStopWords(value: Array[String]): this.type = set(stopWords, value) /** @group getParam */ def getStopWords: Array[String] = $(stopWords) @@ -96,70 +89,39 @@ class StopWordsRemover(override val uid: String) def getCaseSensitive: Boolean = $(caseSensitive) /** - * the language of stop words - * Default: "english" - * @group param - */ + * the language of stop words + * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, + * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + * Default: "English" + * @group param + */ val language: Param[String] = new Param[String](this, "language", "stopwords language") /** @group setParam */ - def setLanguage(value: String): this.type = { - val lang = value.toLowerCase - require(StopWords.languageMap.contains(lang), s"$lang is not in language list") - set(language, lang) - set(stopWords, StopWords.languageMap(lang)) - } + def setLanguage(value: String): this.type = set(language, value.toLowerCase) /** @group getParam */ def getLanguage: String = $(language) - /** - * the ignored stop words set to be ignored out - * Default: [[Array.empty]] - * @group param - */ - val ignoredWords: StringArrayParam = new StringArrayParam(this, "ignoredWords", - "the ignored stop words set to be ignored out") - - /** @group setParam */ - def setIgnoredWords(value: Array[String]): this.type = set(ignoredWords, value) - - /** @group getParam */ - def getIgnoredWords: Array[String] = $(ignoredWords) - - /** - * the additional stop words set to be filtered out - * Default: [[Array.empty]] - * @group param - */ - val additionalWords: StringArrayParam = new StringArrayParam(this, "additionalWords", - "the additional stop words set to be filtered out") - - /** @group setParam */ - def setAdditionalWords(value: Array[String]): this.type = set(additionalWords, value) - - /** @group getParam */ - def getAdditionalWords: Array[String] = $(additionalWords) - - setDefault(stopWords -> StopWords.languageMap("english"), + setDefault(stopWords -> Array.empty[String], language -> "english", - ignoredWords -> Array.empty[String], - additionalWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { + val stopWordsSet = if ($(stopWords).isEmpty) { + StopWords.readStopWords($(language)).toSet + } else { + $(stopWords).toSet + } + val outputSchema = transformSchema(dataset.schema) val t = if ($(caseSensitive)) { - val stopWordsSet = ($(stopWords) ++ $(additionalWords)).toSet -- $(ignoredWords).toSet udf { terms: Seq[String] => terms.filter(s => !stopWordsSet.contains(s)) } } else { val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = { - ($(stopWords) ++ $(additionalWords)) - .map(toLower(_)).toSet -- $(ignoredWords).map(toLower(_)).toSet - } + val lowerStopWords = stopWordsSet.map(toLower(_)).toSet udf { terms: Seq[String] => terms.filter(s => !lowerStopWords.contains(toLower(s))) } @@ -185,4 +147,11 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) + + /** + * Stop words for the language + * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, + * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + */ + def loadStopWords(language: String): Array[String] = StopWords.readStopWords(language.toLowerCase) } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 92c177ad68619..44fb1c8f04de8 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -54,6 +54,24 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("StopWordsRemover with particular stop words list") { + val stopWords = Array("test", "a", "an", "the") + val remover = new StopWordsRemover() + .setInputCol("raw") + .setOutputCol("filtered") + .setStopWords(stopWords) + val dataSet = sqlContext.createDataFrame(Seq( + (Seq("test", "test"), Seq()), + (Seq("a", "b", "c", "d"), Seq("b", "c")), + (Seq("a", "the", "an"), Seq()), + (Seq("A", "The", "AN"), Seq()), + (Seq(null), Seq(null)), + (Seq(), Seq()) + )).toDF("raw", "expected") + + testStopWordsRemover(remover, dataSet) + } + test("StopWordsRemover case sensitive") { val remover = new StopWordsRemover() .setInputCol("raw") @@ -68,11 +86,11 @@ class StopWordsRemoverSuite } test("StopWordsRemover with ignored words") { - val ignoredWords = Array("a") + val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setIgnoredWords(ignoredWords) + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq("python", "scala", "a")), (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) @@ -82,11 +100,11 @@ class StopWordsRemoverSuite } test("StopWordsRemover with additional words") { - val additionalWords = Array("python", "scala") + val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setAdditionalWords(additionalWords) + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( (Seq("python", "scala", "a"), Seq()), (Seq("Python", "Scala", "swift"), Seq("swift")) From 4d1812aae64b0b15312940b1a6c42e19f9686480 Mon Sep 17 00:00:00 2001 From: Burak KOSE Date: Tue, 22 Mar 2016 19:35:37 +0200 Subject: [PATCH 10/22] fix test case bug After updating English stop words list, "d" is a stop word. --- .../org/apache/spark/ml/feature/StopWordsRemoverSuite.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 44fb1c8f04de8..89727e74f41e2 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -44,7 +44,7 @@ class StopWordsRemoverSuite .setOutputCol("filtered") val dataSet = sqlContext.createDataFrame(Seq( (Seq("test", "test"), Seq("test", "test")), - (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), + (Seq("a", "b", "c", "d"), Seq("b", "c")), (Seq("a", "the", "an"), Seq()), (Seq("A", "The", "AN"), Seq()), (Seq(null), Seq(null)), From a30862231c3944c55c96cc94e162f61614aee6d5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Tue, 22 Mar 2016 23:45:48 +0200 Subject: [PATCH 11/22] fix encoding --- .../scala/org/apache/spark/ml/feature/StopWordsRemover.scala | 2 +- .../org/apache/spark/ml/feature/StopWordsRemoverSuite.scala | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index f3cd55c1984e4..ab77d9570e943 100644 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -35,7 +35,7 @@ private[spark] object StopWords { def readStopWords(language: String): Array[String] = { require(supportedLanguages.contains(language), s"$language is not in language list") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") - scala.io.Source.fromInputStream(is).getLines().toArray + scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray } /** Supported languages list must be lowercase */ diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 89727e74f41e2..0c0197fc70e44 100644 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -62,7 +62,7 @@ class StopWordsRemoverSuite .setStopWords(stopWords) val dataSet = sqlContext.createDataFrame(Seq( (Seq("test", "test"), Seq()), - (Seq("a", "b", "c", "d"), Seq("b", "c")), + (Seq("a", "b", "c", "d"), Seq("b", "c", "d")), (Seq("a", "the", "an"), Seq()), (Seq("A", "The", "AN"), Seq()), (Seq(null), Seq(null)), From 2e7c54e5c17e7c5672a43ffc28acb207e94bf28a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 23 Mar 2016 03:42:36 +0200 Subject: [PATCH 12/22] fix pyspark test --- python/pyspark/ml/feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 5025493c42c38..a17c85ae6e67b 100644 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1700,8 +1700,8 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWords - defaultStopWords = stopWordsObj.English() + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + defaultStopWords = stopWordsObj.loadStopWords("english") self._setDefault(stopWords=defaultStopWords, caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) From 7efda40e39663deef0b0884a7bfca13b5d10d706 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 23 Mar 2016 18:51:48 +0200 Subject: [PATCH 13/22] add licence for stop words list --- licenses/LICENCE-postgresql.txt | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 licenses/LICENCE-postgresql.txt diff --git a/licenses/LICENCE-postgresql.txt b/licenses/LICENCE-postgresql.txt new file mode 100644 index 0000000000000..515bf9af4d432 --- /dev/null +++ b/licenses/LICENCE-postgresql.txt @@ -0,0 +1,24 @@ +PostgreSQL Database Management System +(formerly known as Postgres, then as Postgres95) + +Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group + +Portions Copyright (c) 1994, The Regents of the University of California + +Permission to use, copy, modify, and distribute this software and its +documentation for any purpose, without fee, and without a written agreement +is hereby granted, provided that the above copyright notice and this +paragraph and the following two paragraphs appear in all copies. + +IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR +DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING +LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS +DOCUMENTATION, EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGE. + +THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, +INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY +AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS +ON AN "AS IS" BASIS, AND THE UNIVERSITY OF CALIFORNIA HAS NO OBLIGATIONS TO +PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. + From a066e8b34ec4824fa26a1e306e197b66400f5ccb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Thu, 24 Mar 2016 19:12:20 +0200 Subject: [PATCH 14/22] change licence to license --- licenses/{LICENCE-postgresql.txt => LICENSE-postgresql.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename licenses/{LICENCE-postgresql.txt => LICENSE-postgresql.txt} (100%) diff --git a/licenses/LICENCE-postgresql.txt b/licenses/LICENSE-postgresql.txt similarity index 100% rename from licenses/LICENCE-postgresql.txt rename to licenses/LICENSE-postgresql.txt From d0f43ace892332dfb3ad25d0ef1d0c0451540e5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:23:37 +0200 Subject: [PATCH 15/22] add readme for stopwords list --- .../org/apache/spark/ml/feature/stopwords/README | 12 ++++++++++++ 1 file changed, 12 insertions(+) create mode 100755 mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README diff --git a/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README new file mode 100755 index 0000000000000..ec08a5080774d --- /dev/null +++ b/mllib/src/main/resources/org/apache/spark/ml/feature/stopwords/README @@ -0,0 +1,12 @@ +Stopwords Corpus + +This corpus contains lists of stop words for several languages. These +are high-frequency grammatical words which are usually ignored in text +retrieval applications. + +They were obtained from: +http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/ + +The English list has been augmented +https://github.com/nltk/nltk_data/issues/22 + From c017ee235287554e28281d1691d0188e358b7ad8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:26:23 +0200 Subject: [PATCH 16/22] merge StopWords into StopWordsRemover --- .../spark/ml/feature/StopWordsRemover.scala | 54 +++++-------------- .../ml/feature/StopWordsRemoverSuite.scala | 27 +++++----- 2 files changed, 28 insertions(+), 53 deletions(-) mode change 100644 => 100755 mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala mode change 100644 => 100755 mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala old mode 100644 new mode 100755 index ab77d9570e943..b991932f36158 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -26,23 +26,6 @@ import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} -/** - * stop words list - */ -private[spark] object StopWords { - - /** Read stop words list from resources */ - def readStopWords(language: String): Array[String] = { - require(supportedLanguages.contains(language), s"$language is not in language list") - val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") - scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray - } - - /** Supported languages list must be lowercase */ - private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", - "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") -} - /** * :: Experimental :: * A feature transformer that filters out stop words from input. @@ -88,28 +71,11 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - /** - * the language of stop words - * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, - * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish - * Default: "English" - * @group param - */ - val language: Param[String] = new Param[String](this, "language", "stopwords language") - - /** @group setParam */ - def setLanguage(value: String): this.type = set(language, value.toLowerCase) - - /** @group getParam */ - def getLanguage: String = $(language) - - setDefault(stopWords -> Array.empty[String], - language -> "english", - caseSensitive -> false) + setDefault(stopWords -> Array.empty[String], caseSensitive -> false) override def transform(dataset: DataFrame): DataFrame = { val stopWordsSet = if ($(stopWords).isEmpty) { - StopWords.readStopWords($(language)).toSet + StopWordsRemover.loadStopWords("english").toSet } else { $(stopWords).toSet } @@ -145,13 +111,21 @@ class StopWordsRemover(override val uid: String) @Since("1.6.0") object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { + private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", + "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") + @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) /** - * Stop words for the language - * Supported languages: Danish, Dutch, English, Finnish, French, German, Hungarian, - * Italian, Norwegian, Portuguese, Russian, Spanish, Swedish, Turkish + * Load stop words for the language + * Supported languages: danish, dutch, english, finnish, french, german, hungarian, + * italian, norwegian, portuguese, russian, spanish, swedish, turkish + * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] */ - def loadStopWords(language: String): Array[String] = StopWords.readStopWords(language.toLowerCase) + def loadStopWords(language: String): Array[String] = { + require(supportedLanguages.contains(language), s"$language is not in language list") + val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") + scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray + } } diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala old mode 100644 new mode 100755 index 0c0197fc70e44..0511d1af4db52 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -85,42 +85,43 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with ignored words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") + test("StopWordsRemover with language selection") { + val stopWords = StopWordsRemover.loadStopWords("turkish") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setStopWords(stopWords.toArray) + .setStopWords(stopWords) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("python", "scala", "a"), Seq("python", "scala", "a")), - (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) + (Seq("acaba", "ama", "biri"), Seq()), + (Seq("hep", "her", "scala"), Seq("scala")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with additional words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") + test("StopWordsRemover with ignored words") { + val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("python", "scala", "a"), Seq()), - (Seq("Python", "Scala", "swift"), Seq("swift")) + (Seq("python", "scala", "a"), Seq("python", "scala", "a")), + (Seq("Python", "Scala", "swift"), Seq("Python", "Scala", "swift")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) } - test("StopWordsRemover with language selection") { + test("StopWordsRemover with additional words") { + val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") - .setLanguage("turkish") + .setStopWords(stopWords.toArray) val dataSet = sqlContext.createDataFrame(Seq( - (Seq("acaba", "ama", "biri"), Seq()), - (Seq("hep", "her", "scala"), Seq("scala")) + (Seq("python", "scala", "a"), Seq()), + (Seq("Python", "Scala", "swift"), Seq("swift")) )).toDF("raw", "expected") testStopWordsRemover(remover, dataSet) From 55191ce1f449bed55884a4481071b0fc5ee776a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:27:59 +0200 Subject: [PATCH 17/22] add python stopwords support for language selection --- python/pyspark/ml/feature.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) mode change 100644 => 100755 python/pyspark/ml/feature.py diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py old mode 100644 new mode 100755 index a17c85ae6e67b..c9c5dfc6db51e --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1700,9 +1700,7 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - defaultStopWords = stopWordsObj.loadStopWords("english") - self._setDefault(stopWords=defaultStopWords, caseSensitive=False) + self._setDefault(stopWords=self.loadStopWords("english"), caseSensitive=False) kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1748,6 +1746,16 @@ def getCaseSensitive(self): """ return self.getOrDefault(self.caseSensitive) + @staticmethod + def loadStopWords(language): + """ + Load stop words for the language + Supported languages: danish, dutch, english, finnish, french, german, hungarian, + italian, norwegian, portuguese, russian, spanish, swedish, turkish + """ + stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover + return stopWordsObj.loadStopWords(language) + @inherit_doc @ignore_unicode_prefix From 789342f2d26759db180868a9f59b02c8f85cc835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Fri, 25 Mar 2016 18:28:48 +0200 Subject: [PATCH 18/22] add new tests for stopwords --- python/pyspark/ml/tests.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) mode change 100644 => 100755 python/pyspark/ml/tests.py diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py old mode 100644 new mode 100755 index 4da9a373e9861..1ae90908341b5 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -336,13 +336,20 @@ def test_stopwordsremover(self): self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) - # Custom + # with particular stop words list stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) self.assertEqual(stopWordRemover.getInputCol(), "input") self.assertEqual(stopWordRemover.getStopWords(), stopwords) transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["a"]) + # with language selection + stopwords = StopWordsRemover.loadStopWords("turkish") + dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])]) + stopWordRemover.setStopWords(stopwords) + self.assertEqual(stopWordRemover.getStopWords(), stopwords) + transformedDF = stopWordRemover.transform(dataset) + self.assertEqual(transformedDF.head().output, []) class HasInducedError(Params): From d3e0ad670faf8de805ba74d6aa29ceed89c230bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 4 May 2016 01:14:34 +0300 Subject: [PATCH 19/22] code review and locale support --- .../spark/ml/feature/StopWordsRemover.scala | 98 +++++++++++-------- .../ml/feature/StopWordsRemoverSuite.scala | 8 +- python/pyspark/ml/feature.py | 43 +++++--- python/pyspark/ml/tests.py | 10 +- 4 files changed, 99 insertions(+), 60 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index b991932f36158..ca66d012d985a 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -17,21 +17,23 @@ package org.apache.spark.ml.feature +import java.util.Locale + import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} +import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} import org.apache.spark.ml.util._ -import org.apache.spark.sql.DataFrame import org.apache.spark.sql.functions.{col, udf} import org.apache.spark.sql.types.{ArrayType, StringType, StructType} +import org.apache.spark.sql.{DataFrame, Dataset} /** - * :: Experimental :: - * A feature transformer that filters out stop words from input. - * Note: null values from input array are preserved unless adding null to stopWords explicitly. - * @see [[http://en.wikipedia.org/wiki/Stop_words]] - */ + * :: Experimental :: + * A feature transformer that filters out stop words from input. + * Note: null values from input array are preserved unless adding null to stopWords explicitly. + * @see [[http://en.wikipedia.org/wiki/Stop_words]] + */ @Experimental class StopWordsRemover(override val uid: String) extends Transformer with HasInputCol with HasOutputCol with DefaultParamsWritable { @@ -45,11 +47,13 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * the stop words set to be filtered out - * Default: [[Array.empty]] - * @group param - */ - val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "stop words") + * The words to be filtered out. + * Default: English stop words + * @see [[StopWordsRemover.loadStopWords()]] + * @group param + */ + val stopWords: StringArrayParam = + new StringArrayParam(this, "stopWords", "the words to be filtered out") /** @group setParam */ def setStopWords(value: Array[String]): this.type = set(stopWords, value) @@ -58,12 +62,12 @@ class StopWordsRemover(override val uid: String) def getStopWords: Array[String] = $(stopWords) /** - * whether to do a case sensitive comparison over the stop words - * Default: false - * @group param - */ + * Whether to do a case sensitive comparison over the stop words. + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", - "whether to do case-sensitive comparison during filtering") + "whether to do a case-sensitive comparison over the stop words") /** @group setParam */ def setCaseSensitive(value: Boolean): this.type = set(caseSensitive, value) @@ -71,34 +75,40 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getCaseSensitive: Boolean = $(caseSensitive) - setDefault(stopWords -> Array.empty[String], caseSensitive -> false) + /** + * Locale for doing a case sensitive comparison + * Default: English locale + * @group param + */ + val locale: Param[String] = new Param[String](this, "locale", + "locale for doing a case sensitive comparison") - override def transform(dataset: DataFrame): DataFrame = { - val stopWordsSet = if ($(stopWords).isEmpty) { - StopWordsRemover.loadStopWords("english").toSet - } else { + /** @group setParam */ + def setLocale(value: String): this.type = set(locale, value) + + /** @group getParam */ + def getLocale: String = $(locale) + + setDefault(stopWords -> StopWordsRemover.loadStopWords("english"), + caseSensitive -> false, locale -> "en") + + @Since("2.0.0") + override def transform(dataset: Dataset[_]): DataFrame = { + val stopWordsSet = if ($(caseSensitive)) { $(stopWords).toSet + } else { + val loadedLocale = StopWordsRemover.loadLocale($(locale)) + $(stopWords).filterNot(_ == null).map(_.toLowerCase(loadedLocale)).toSet } - val outputSchema = transformSchema(dataset.schema) - val t = if ($(caseSensitive)) { - udf { terms: Seq[String] => - terms.filter(s => !stopWordsSet.contains(s)) - } - } else { - val toLower = (s: String) => if (s != null) s.toLowerCase else s - val lowerStopWords = stopWordsSet.map(toLower(_)).toSet - udf { terms: Seq[String] => - terms.filter(s => !lowerStopWords.contains(toLower(s))) - } + val t = udf { terms: Seq[String] => + terms.filterNot(stopWordsSet.contains) } - val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata)) } override def transformSchema(schema: StructType): StructType = { - validateParams() val inputType = schema($(inputCol)).dataType require(inputType.sameType(ArrayType(StringType)), s"Input type must be ArrayType(StringType) but got $inputType.") @@ -111,6 +121,8 @@ class StopWordsRemover(override val uid: String) @Since("1.6.0") object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { + private def loadLocale(value : String) = new Locale(value) + private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") @@ -118,14 +130,16 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { override def load(path: String): StopWordsRemover = super.load(path) /** - * Load stop words for the language - * Supported languages: danish, dutch, english, finnish, french, german, hungarian, - * italian, norwegian, portuguese, russian, spanish, swedish, turkish - * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] - */ + * Load stop words for the language + * Supported languages: danish, dutch, english, finnish, french, german, hungarian, + * italian, norwegian, portuguese, russian, spanish, swedish, turkish + * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] + */ + @Since("2.0.0") def loadStopWords(language: String): Array[String] = { - require(supportedLanguages.contains(language), s"$language is not in language list") + require(supportedLanguages.contains(language), + s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray } -} +} \ No newline at end of file diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 0511d1af4db52..2abcda9e9bcfe 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -20,16 +20,16 @@ package org.apache.spark.ml.feature import org.apache.spark.SparkFunSuite import org.apache.spark.ml.util.DefaultReadWriteTest import org.apache.spark.mllib.util.MLlibTestSparkContext -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{Dataset, Row} object StopWordsRemoverSuite extends SparkFunSuite { - def testStopWordsRemover(t: StopWordsRemover, dataset: DataFrame): Unit = { + def testStopWordsRemover(t: StopWordsRemover, dataset: Dataset[_]): Unit = { t.transform(dataset) .select("filtered", "expected") .collect() .foreach { case Row(tokens, wantedTokens) => assert(tokens === wantedTokens) - } + } } } @@ -150,4 +150,4 @@ class StopWordsRemoverSuite } assert(thrown.getMessage == s"requirement failed: Column $outputCol already exists.") } -} +} \ No newline at end of file diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index c9c5dfc6db51e..f4afa0e73070d 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1664,7 +1664,7 @@ def getLabels(self): return self.getOrDefault(self.labels) -class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, MLWritable): +class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadable, JavaMLWritable): """ .. note:: Experimental @@ -1686,31 +1686,32 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, MLReadable, M .. versionadded:: 1.6.0 """ - stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out") + stopWords = Param(Params._dummy(), "stopWords", "The words to be filtered out", + typeConverter=TypeConverters.toListString) caseSensitive = Param(Params._dummy(), "caseSensitive", "whether to do a case sensitive " + - "comparison over the stop words") + "comparison over the stop words", typeConverter=TypeConverters.toBoolean) + locale = Param(Params._dummy(), "locale", "locale for doing a case sensitive comparison", + typeConverter=TypeConverters.toString) @keyword_only def __init__(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + caseSensitive=False, locale="en"): """ __init__(self, inputCol=None, outputCol=None, stopWords=None,\ - caseSensitive=false) + caseSensitive=false, locale="en") """ super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self._setDefault(stopWords=self.loadStopWords("english"), caseSensitive=False) + self._setDefault(stopWords=StopWordsRemover.loadStopWords("english"), caseSensitive=False, locale="en") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, stopWords=None, - caseSensitive=False): + def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, locale="en"): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None,\ - caseSensitive=false) + setParams(self, inputCol="input", outputCol="output", stopWords=None, caseSensitive=false, locale="en") Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs @@ -1721,7 +1722,7 @@ def setStopWords(self, value): """ Specify the stopwords to be filtered. """ - self._paramMap[self.stopWords] = value + self._set(stopWords=value) return self @since("1.6.0") @@ -1736,7 +1737,7 @@ def setCaseSensitive(self, value): """ Set whether to do a case sensitive comparison over the stop words """ - self._paramMap[self.caseSensitive] = value + self._set(caseSensitive=value) return self @since("1.6.0") @@ -1746,7 +1747,23 @@ def getCaseSensitive(self): """ return self.getOrDefault(self.caseSensitive) + @since("2.0.0") + def setLocale(self, value): + """ + Set locale for doing a case sensitive comparison + """ + self._set(caseSensitive=value) + return self + + @since("2.0.0") + def getLocale(self): + """ + Get locale for doing a case sensitive comparison + """ + return self.getOrDefault(self.caseSensitive) + @staticmethod + @since("2.0.0") def loadStopWords(language): """ Load stop words for the language @@ -1754,7 +1771,7 @@ def loadStopWords(language): italian, norwegian, portuguese, russian, spanish, swedish, turkish """ stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - return stopWordsObj.loadStopWords(language) + return list(stopWordsObj.loadStopWords(language)) @inherit_doc diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 1ae90908341b5..b517cd9061c97 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -113,7 +113,6 @@ def test_int_to_float(self): lr.fit(df) def test_invalid_to_float(self): - from pyspark.mllib.linalg import Vectors self.assertRaises(Exception, lambda: LogisticRegression(elasticNetParam="happy")) lr = LogisticRegression(elasticNetParam=0) self.assertRaises(Exception, lambda: lr.setElasticNetParam("panda")) @@ -336,6 +335,8 @@ def test_stopwordsremover(self): self.assertEqual(stopWordRemover.getInputCol(), "input") transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["panda"]) + self.assertEqual(type(stopWordRemover.getStopWords()), list) + self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) # with particular stop words list stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) @@ -350,6 +351,13 @@ def test_stopwordsremover(self): self.assertEqual(stopWordRemover.getStopWords(), stopwords) transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, []) + # with locale + stopwords = ["BİRİ"] + dataset = sqlContext.createDataFrame([Row(input=["biri"])]) + stopWordRemover.setStopWords(stopwords).setLocale("tr") + self.assertEqual(stopWordRemover.getStopWords(), stopwords) + transformedDF = stopWordRemover.transform(dataset) + self.assertEqual(transformedDF.head().output, []) class HasInducedError(Params): From cb786eef0f75aa19d9416ed1c07b90510b4bf70b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 4 May 2016 02:33:54 +0300 Subject: [PATCH 20/22] fix stylecheck --- .../spark/ml/feature/StopWordsRemover.scala | 40 +++++++++---------- .../ml/feature/StopWordsRemoverSuite.scala | 2 +- python/pyspark/ml/feature.py | 9 +++-- 3 files changed, 27 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 96aa7bb8bde17..f9efb7a789a05 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -21,8 +21,8 @@ import java.util.Locale import org.apache.spark.annotation.{Experimental, Since} import org.apache.spark.ml.Transformer -import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.param.{BooleanParam, Param, ParamMap, StringArrayParam} +import org.apache.spark.ml.param.shared.{HasInputCol, HasOutputCol} import org.apache.spark.ml.util._ import org.apache.spark.sql.{DataFrame, Dataset} import org.apache.spark.sql.functions.{col, udf} @@ -47,11 +47,11 @@ class StopWordsRemover(override val uid: String) def setOutputCol(value: String): this.type = set(outputCol, value) /** - * The words to be filtered out. - * Default: English stop words - * @see [[StopWordsRemover.loadStopWords()]] - * @group param - */ + * The words to be filtered out. + * Default: English stop words + * @see [[StopWordsRemover.loadStopWords()]] + * @group param + */ val stopWords: StringArrayParam = new StringArrayParam(this, "stopWords", "the words to be filtered out") @@ -62,10 +62,10 @@ class StopWordsRemover(override val uid: String) def getStopWords: Array[String] = $(stopWords) /** - * Whether to do a case sensitive comparison over the stop words. - * Default: false - * @group param - */ + * Whether to do a case sensitive comparison over the stop words. + * Default: false + * @group param + */ val caseSensitive: BooleanParam = new BooleanParam(this, "caseSensitive", "whether to do a case-sensitive comparison over the stop words") @@ -76,10 +76,10 @@ class StopWordsRemover(override val uid: String) def getCaseSensitive: Boolean = $(caseSensitive) /** - * Locale for doing a case sensitive comparison - * Default: English locale - * @group param - */ + * Locale for doing a case sensitive comparison + * Default: English locale + * @group param + */ val locale: Param[String] = new Param[String](this, "locale", "locale for doing a case sensitive comparison") @@ -130,11 +130,11 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { override def load(path: String): StopWordsRemover = super.load(path) /** - * Load stop words for the language - * Supported languages: danish, dutch, english, finnish, french, german, hungarian, - * italian, norwegian, portuguese, russian, spanish, swedish, turkish - * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] - */ + * Load stop words for the language + * Supported languages: danish, dutch, english, finnish, french, german, hungarian, + * italian, norwegian, portuguese, russian, spanish, swedish, turkish + * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] + */ @Since("2.0.0") def loadStopWords(language: String): Array[String] = { require(supportedLanguages.contains(language), @@ -142,4 +142,4 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") scala.io.Source.fromInputStream(is)(scala.io.Codec.UTF8).getLines().toArray } -} \ No newline at end of file +} diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index a38f68492e9f1..99b353addb1f2 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -150,4 +150,4 @@ class StopWordsRemoverSuite } assert(thrown.getMessage == s"requirement failed: Column $outputCol already exists.") } -} \ No newline at end of file +} diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 2af31fc336e04..6d9b3c766366e 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1749,15 +1749,18 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self._setDefault(stopWords=StopWordsRemover.loadStopWords("english"), caseSensitive=False, locale="en") + self._setDefault(stopWords=StopWordsRemover.loadStopWords("english"), + caseSensitive=False, locale="en") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @keyword_only @since("1.6.0") - def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, locale="en"): + def setParams(self, inputCol=None, outputCol=None, stopWords=None, + caseSensitive=False, locale="en"): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None, caseSensitive=false, locale="en") + setParams(self, inputCol="input", outputCol="output", stopWords=None, + caseSensitive=false, locale="en") Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs From 01471ec2a74ff86dfaa417509d0f90e2db80b768 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Wed, 4 May 2016 21:41:13 +0300 Subject: [PATCH 21/22] address feedback --- .../spark/ml/feature/StopWordsRemover.scala | 11 ++++---- .../ml/feature/StopWordsRemoverSuite.scala | 13 +++++++-- python/pyspark/ml/feature.py | 28 +++++++++---------- python/pyspark/ml/tests.py | 4 +-- 4 files changed, 32 insertions(+), 24 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index f9efb7a789a05..545940edcddc0 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -49,7 +49,7 @@ class StopWordsRemover(override val uid: String) /** * The words to be filtered out. * Default: English stop words - * @see [[StopWordsRemover.loadStopWords()]] + * @see [[StopWordsRemover.loadDefaultStopWords()]] * @group param */ val stopWords: StringArrayParam = @@ -89,7 +89,7 @@ class StopWordsRemover(override val uid: String) /** @group getParam */ def getLocale: String = $(locale) - setDefault(stopWords -> StopWordsRemover.loadStopWords("english"), + setDefault(stopWords -> StopWordsRemover.loadDefaultStopWords("english"), caseSensitive -> false, locale -> "en") @Since("2.0.0") @@ -123,20 +123,21 @@ object StopWordsRemover extends DefaultParamsReadable[StopWordsRemover] { private def loadLocale(value : String) = new Locale(value) - private val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", + private[feature] + val supportedLanguages = Set("danish", "dutch", "english", "finnish", "french", "german", "hungarian", "italian", "norwegian", "portuguese", "russian", "spanish", "swedish", "turkish") @Since("1.6.0") override def load(path: String): StopWordsRemover = super.load(path) /** - * Load stop words for the language + * Loads the default stop words for the given language. * Supported languages: danish, dutch, english, finnish, french, german, hungarian, * italian, norwegian, portuguese, russian, spanish, swedish, turkish * @see [[http://anoncvs.postgresql.org/cvsweb.cgi/pgsql/src/backend/snowball/stopwords/]] */ @Since("2.0.0") - def loadStopWords(language: String): Array[String] = { + def loadDefaultStopWords(language: String): Array[String] = { require(supportedLanguages.contains(language), s"$language is not in the supported language list: ${supportedLanguages.mkString(", ")}.") val is = getClass.getResourceAsStream(s"/org/apache/spark/ml/feature/stopwords/$language.txt") diff --git a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala index 99b353addb1f2..8e7e000fbc112 100755 --- a/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala +++ b/mllib/src/test/scala/org/apache/spark/ml/feature/StopWordsRemoverSuite.scala @@ -85,8 +85,15 @@ class StopWordsRemoverSuite testStopWordsRemover(remover, dataSet) } + test("default stop words of supported languages are not empty") { + StopWordsRemover.supportedLanguages.foreach { lang => + assert(StopWordsRemover.loadDefaultStopWords(lang).nonEmpty, + s"The default stop words of $lang cannot be empty.") + } + } + test("StopWordsRemover with language selection") { - val stopWords = StopWordsRemover.loadStopWords("turkish") + val stopWords = StopWordsRemover.loadDefaultStopWords("turkish") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") @@ -100,7 +107,7 @@ class StopWordsRemoverSuite } test("StopWordsRemover with ignored words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet -- Set("a") + val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet -- Set("a") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") @@ -114,7 +121,7 @@ class StopWordsRemoverSuite } test("StopWordsRemover with additional words") { - val stopWords = StopWordsRemover.loadStopWords("english").toSet ++ Set("python", "scala") + val stopWords = StopWordsRemover.loadDefaultStopWords("english").toSet ++ Set("python", "scala") val remover = new StopWordsRemover() .setInputCol("raw") .setOutputCol("filtered") diff --git a/python/pyspark/ml/feature.py b/python/pyspark/ml/feature.py index 6d9b3c766366e..389b358797581 100755 --- a/python/pyspark/ml/feature.py +++ b/python/pyspark/ml/feature.py @@ -1743,13 +1743,13 @@ class StopWordsRemover(JavaTransformer, HasInputCol, HasOutputCol, JavaMLReadabl def __init__(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, locale="en"): """ - __init__(self, inputCol=None, outputCol=None, stopWords=None,\ - caseSensitive=false, locale="en") + __init__(self, inputCol=None, outputCol=None, stopWords=None, \ + caseSensitive=false, locale="en") """ super(StopWordsRemover, self).__init__() self._java_obj = self._new_java_obj("org.apache.spark.ml.feature.StopWordsRemover", self.uid) - self._setDefault(stopWords=StopWordsRemover.loadStopWords("english"), + self._setDefault(stopWords=StopWordsRemover.loadDefaultStopWords("english"), caseSensitive=False, locale="en") kwargs = self.__init__._input_kwargs self.setParams(**kwargs) @@ -1759,8 +1759,8 @@ def __init__(self, inputCol=None, outputCol=None, stopWords=None, def setParams(self, inputCol=None, outputCol=None, stopWords=None, caseSensitive=False, locale="en"): """ - setParams(self, inputCol="input", outputCol="output", stopWords=None, - caseSensitive=false, locale="en") + setParams(self, inputCol="input", outputCol="output", stopWords=None, \ + caseSensitive=false, locale="en") Sets params for this StopWordRemover. """ kwargs = self.setParams._input_kwargs @@ -1769,35 +1769,35 @@ def setParams(self, inputCol=None, outputCol=None, stopWords=None, @since("1.6.0") def setStopWords(self, value): """ - Specify the stopwords to be filtered. + Sets the value of :py:attr:`stopWords`. """ return self._set(stopWords=value) @since("1.6.0") def getStopWords(self): """ - Get the stopwords. + Gets the value of :py:attr:`stopWords` or its default value. """ return self.getOrDefault(self.stopWords) @since("1.6.0") def setCaseSensitive(self, value): """ - Set whether to do a case sensitive comparison over the stop words + Sets the value of :py:attr:`caseSensitive`. """ return self._set(caseSensitive=value) @since("1.6.0") def getCaseSensitive(self): """ - Get whether to do a case sensitive comparison over the stop words. + Gets the value of :py:attr:`caseSensitive` or its default value. """ return self.getOrDefault(self.caseSensitive) @since("2.0.0") def setLocale(self, value): """ - Set locale for doing a case sensitive comparison + Sets the value of :py:attr:`locale`. """ self._set(caseSensitive=value) return self @@ -1805,20 +1805,20 @@ def setLocale(self, value): @since("2.0.0") def getLocale(self): """ - Get locale for doing a case sensitive comparison + Gets the value of :py:attr:`locale`. """ return self.getOrDefault(self.caseSensitive) @staticmethod @since("2.0.0") - def loadStopWords(language): + def loadDefaultStopWords(language): """ - Load stop words for the language + Loads the default stop words for the given language. Supported languages: danish, dutch, english, finnish, french, german, hungarian, italian, norwegian, portuguese, russian, spanish, swedish, turkish """ stopWordsObj = _jvm().org.apache.spark.ml.feature.StopWordsRemover - return list(stopWordsObj.loadStopWords(language)) + return list(stopWordsObj.loadDefaultStopWords(language)) @inherit_doc diff --git a/python/pyspark/ml/tests.py b/python/pyspark/ml/tests.py index 85a75dc16a52d..88be3377dd5b5 100755 --- a/python/pyspark/ml/tests.py +++ b/python/pyspark/ml/tests.py @@ -410,7 +410,7 @@ def test_stopwordsremover(self): self.assertEqual(transformedDF.head().output, ["panda"]) self.assertEqual(type(stopWordRemover.getStopWords()), list) self.assertTrue(isinstance(stopWordRemover.getStopWords()[0], basestring)) - # with particular stop words list + # Custom stopwords = ["panda"] stopWordRemover.setStopWords(stopwords) self.assertEqual(stopWordRemover.getInputCol(), "input") @@ -418,7 +418,7 @@ def test_stopwordsremover(self): transformedDF = stopWordRemover.transform(dataset) self.assertEqual(transformedDF.head().output, ["a"]) # with language selection - stopwords = StopWordsRemover.loadStopWords("turkish") + stopwords = StopWordsRemover.loadDefaultStopWords("turkish") dataset = sqlContext.createDataFrame([Row(input=["acaba", "ama", "biri"])]) stopWordRemover.setStopWords(stopwords) self.assertEqual(stopWordRemover.getStopWords(), stopwords) From dec0634a574124ab53c706b14982a6c81a282c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Burak=20K=C3=B6se?= Date: Thu, 5 May 2016 00:29:45 +0300 Subject: [PATCH 22/22] fix locale --- .../spark/ml/feature/StopWordsRemover.scala | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala index 545940edcddc0..0091175e77f83 100755 --- a/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala +++ b/mllib/src/main/scala/org/apache/spark/ml/feature/StopWordsRemover.scala @@ -94,15 +94,18 @@ class StopWordsRemover(override val uid: String) @Since("2.0.0") override def transform(dataset: Dataset[_]): DataFrame = { - val stopWordsSet = if ($(caseSensitive)) { - $(stopWords).toSet + val outputSchema = transformSchema(dataset.schema) + val t = if ($(caseSensitive)) { + val stopWordsSet = $(stopWords).toSet + udf { terms: Seq[String] => + terms.filterNot(stopWordsSet.contains) + } } else { val loadedLocale = StopWordsRemover.loadLocale($(locale)) - $(stopWords).filterNot(_ == null).map(_.toLowerCase(loadedLocale)).toSet - } - val outputSchema = transformSchema(dataset.schema) - val t = udf { terms: Seq[String] => - terms.filterNot(stopWordsSet.contains) + val stopWordsSet = $(stopWords).filterNot(_ == null).map(_.toLowerCase(loadedLocale)).toSet + udf { terms: Seq[String] => + terms.filterNot(term => stopWordsSet.contains(term.toLowerCase(loadedLocale))) + } } val metadata = outputSchema($(outputCol)).metadata dataset.select(col("*"), t(col($(inputCol))).as($(outputCol), metadata))