-
Notifications
You must be signed in to change notification settings - Fork 259
/
test.py
58 lines (47 loc) · 2.15 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Run chardet on a bunch of documents and see that we get the correct encodings.
:author: Dan Blanchard
:author: Ian Cordasco
"""
from __future__ import with_statement
from os import listdir
from os.path import dirname, isdir, join, realpath, relpath, splitext
from nose.tools import eq_
import chardet
# Expected-encoding names that are compared under a different, equivalent
# name: a test directory named after the key is checked against the value.
EQUIVALENT_ENCODINGS = {
    'latin1': 'windows-1252',
}

# Encodings whose test directories are skipped for now.
# TODO: Restore Hungarian encodings (iso-8859-2 and windows-1250) after we
# retrain model.
MISSING_ENCODINGS = set((
    'iso-8859-2',
    'iso-8859-6',
    'windows-1250',
    'windows-1254',
    'windows-1256',
))
def check_file_encoding(file_name, encoding):
    """Ensure that we detect the encoding for file_name correctly.

    :param file_name: Path of the document to check; it is read in binary
                      mode and its whole content is handed to chardet.
    :param encoding: Name of the encoding the document is expected to be
                     in.  It is compared against the lower-cased detection
                     result, after being replaced by its entry in
                     ``EQUIVALENT_ENCODINGS`` if it has one.
    :raises AssertionError: (via ``eq_``) when the detected encoding does
                            not match, including when chardet reports no
                            encoding at all.
    """
    with open(file_name, 'rb') as f:
        result = chardet.detect(f.read())
    expected = EQUIVALENT_ENCODINGS.get(encoding, encoding)
    detected = result['encoding']
    # chardet reports ``None`` when it cannot settle on an encoding.  Guard
    # the ``lower()`` call so that case fails the comparison with the usual
    # message instead of blowing up with an AttributeError.
    if detected is None:
        normalized = None
    else:
        normalized = detected.lower()
    eq_(normalized, expected,
        "Expected %s, but got %s for %s" % (expected, detected, file_name))
def test_encoding_detection():
    """Generate one encoding check per sample document under ``tests``.

    Every sub-directory of the ``tests`` directory next to this file is
    named after an encoding, optionally followed by a language suffix.  For
    each document with a recognised extension inside such a directory this
    yields ``(check_file_encoding, document_path, encoding)``.
    """
    language_suffixes = ('-arabic', '-bulgarian', '-cyrillic', '-greek',
                         '-hebrew', '-hungarian', '-turkish')
    document_extensions = ('.html', '.txt', '.xml', '.srt')
    tests_root = relpath(join(dirname(realpath(__file__)), 'tests'))

    for dir_name in listdir(tests_root):
        encoding_dir = join(tests_root, dir_name)
        # Plain files sitting directly in the tests directory are ignored.
        if not isdir(encoding_dir):
            continue

        # The lower-cased directory name is the encoding; drop a trailing
        # language suffix if present (only the first match is removed).
        encoding = dir_name.lower()
        for suffix in language_suffixes:
            if encoding.endswith(suffix):
                encoding = encoding[:-len(suffix)]
                break

        # Skip directories for encodings we don't handle yet.
        if encoding in MISSING_ENCODINGS:
            continue

        # Emit a check for every sample document of this encoding.
        for document in listdir(encoding_dir):
            extension = splitext(document)[1].lower()
            if extension in document_extensions:
                yield (check_file_encoding, join(encoding_dir, document),
                       encoding)