Skip to content

Commit 83ed7ce

Browse files
very simple tokeniser
1 parent 0a91e41 commit 83ed7ce

File tree

1 file changed

+31
-0
lines changed

1 file changed

+31
-0
lines changed

tokenise.py

+31
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
import re
2+
3+
def tokenise_words(sentence):
    '''
    Yield (token, misc) pairs for a sentence, tokenised according to the
    UD Scottish Gaelic rules.

    Spaces separate tokens. Each of the characters . , [ ] ( ) is emitted as
    a token of its own.
    Hyphens are not word boundaries.
    Apostrophes aren't word boundaries either.

    This is a generator. For every token, misc is "SpaceAfter=None" when the
    next character of the sentence belongs to another token (no space in
    between), and "_" otherwise. An empty sentence yields nothing.
    '''
    # NOTE(review): CoNLL-U spells this MISC value "SpaceAfter=No"; the
    # string is kept as it was so existing callers see unchanged output.
    # Confirm which spelling downstream code expects.
    no_space_after = "SpaceAfter=None"
    punctuation = ".,[]()"
    last_index = len(sentence) - 1
    token = ""
    for i, character in enumerate(sentence):
        if character == " ":
            # A space closes the current word, which was followed by a space.
            if token:
                yield (token, "_")
                token = ""
        elif character in punctuation:
            # The pending word touches this punctuation mark directly.
            if token:
                yield (token, no_space_after)
                token = ""
            # Look at the NEXT character to decide whether a space follows
            # the punctuation mark; the mark itself is never a space.
            if i < last_index and sentence[i + 1] != " ":
                yield (character, no_space_after)
            else:
                yield (character, "_")
        else:
            token += character
    # Flush a word that runs up to the end of the sentence.
    if token:
        yield (token, "_")
28+
29+
# Demo: print the (token, misc) pairs produced for three sample sentences,
# one list per line of output.
for _demo_sentence in (
    "Am falbh thu leam",
    "[Rugadh DOMHNALL MAC EACHARNA air latha Nollaig anns a' bhliadhna 1836, ann an Gleann-garasdail an taobh tuath eilein Dhiùra laimh ri Coire Bhreacain.",
    "Thigeadh iad an sin 'n an ruith 's 'n an leum a dh' fhaicinn ciod an tubaist a dh' éirich do'n bhuachaille.",
):
    print(list(tokenise_words(_demo_sentence)))

0 commit comments

Comments
 (0)