Skip to content

Commit 3d11564

Browse files
Force 'utf-8' encoding without relying on platform-dependent default
On Windows, the default text encoding is locale-dependent (typically 'cp1252'), so opening the UTF-8 wiki dump without an explicit encoding raises a UnicodeDecodeError. Fixes fastai#5
1 parent 85e5052 commit 3d11564

File tree

1 file changed

+3
-3
lines changed

1 file changed

+3
-3
lines changed

Diff for: nlputils.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def get_wiki(path,lang):
2626
shutil.rmtree(path/'text')
2727

2828

29-
def split_wiki(path,lang):
29+
def split_wiki(path,lang,encoding='utf-8'):
3030
dest = path/'docs'
3131
name = f'{lang}wiki'
3232
if dest.exists():
@@ -35,7 +35,7 @@ def split_wiki(path,lang):
3535

3636
dest.mkdir(exist_ok=True, parents=True)
3737
title_re = re.compile(rf'<doc id="\d+" url="https://{lang}.wikipedia.org/wiki\?curid=\d+" title="([^"]+)">')
38-
lines = (path/name).open()
38+
lines = (path/name).open(encoding=encoding)
3939
f=None
4040

4141
for i,l in enumerate(lines):
@@ -44,7 +44,7 @@ def split_wiki(path,lang):
4444
title = title_re.findall(l)[0].replace('/','_')
4545
if len(title)>150: continue
4646
if f: f.close()
47-
f = (dest/f'{title}.txt').open('w')
47+
f = (dest/f'{title}.txt').open('w', encoding=encoding)
4848
else: f.write(l)
4949
f.close()
5050
return dest

0 commit comments

Comments
 (0)