diff --git a/bench.py b/bench.py index 2761e19f..c8e6cd07 100644 --- a/bench.py +++ b/bench.py @@ -5,13 +5,11 @@ :author: Ian Cordasco """ -from __future__ import print_function, with_statement import argparse import sys import time from collections import defaultdict -from io import open from os import listdir from os.path import dirname, isdir, join, realpath, relpath, splitext diff --git a/chardet/big5prober.py b/chardet/big5prober.py index 98f99701..8d2eda0f 100644 --- a/chardet/big5prober.py +++ b/chardet/big5prober.py @@ -33,7 +33,7 @@ class Big5Prober(MultiByteCharSetProber): def __init__(self): - super(Big5Prober, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(BIG5_SM_MODEL) self.distribution_analyzer = Big5DistributionAnalysis() self.reset() diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py index c0395f4a..91ad72a4 100644 --- a/chardet/chardistribution.py +++ b/chardet/chardistribution.py @@ -37,7 +37,7 @@ JIS_TYPICAL_DISTRIBUTION_RATIO) -class CharDistributionAnalysis(object): +class CharDistributionAnalysis: ENOUGH_DATA_THRESHOLD = 1024 SURE_YES = 0.99 SURE_NO = 0.01 @@ -112,7 +112,7 @@ def get_order(self, byte_str): class EUCTWDistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(EUCTWDistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = EUCTW_CHAR_TO_FREQ_ORDER self._table_size = EUCTW_TABLE_SIZE self.typical_distribution_ratio = EUCTW_TYPICAL_DISTRIBUTION_RATIO @@ -131,7 +131,7 @@ def get_order(self, byte_str): class EUCKRDistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(EUCKRDistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = EUCKR_CHAR_TO_FREQ_ORDER self._table_size = EUCKR_TABLE_SIZE self.typical_distribution_ratio = EUCKR_TYPICAL_DISTRIBUTION_RATIO @@ -150,7 +150,7 @@ def get_order(self, byte_str): class GB2312DistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(GB2312DistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = GB2312_CHAR_TO_FREQ_ORDER self._table_size = GB2312_TABLE_SIZE self.typical_distribution_ratio = GB2312_TYPICAL_DISTRIBUTION_RATIO @@ -169,7 +169,7 @@ def get_order(self, byte_str): class Big5DistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(Big5DistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = BIG5_CHAR_TO_FREQ_ORDER self._table_size = BIG5_TABLE_SIZE self.typical_distribution_ratio = BIG5_TYPICAL_DISTRIBUTION_RATIO @@ -191,7 +191,7 @@ def get_order(self, byte_str): class SJISDistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(SJISDistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO @@ -216,7 +216,7 @@ def get_order(self, byte_str): class EUCJPDistributionAnalysis(CharDistributionAnalysis): def __init__(self): - super(EUCJPDistributionAnalysis, self).__init__() + super().__init__() self._char_to_freq_order = JIS_CHAR_TO_FREQ_ORDER self._table_size = JIS_TABLE_SIZE self.typical_distribution_ratio = JIS_TYPICAL_DISTRIBUTION_RATIO diff --git a/chardet/charsetgroupprober.py b/chardet/charsetgroupprober.py index 5812cef0..66d55e08 100644 --- a/chardet/charsetgroupprober.py +++ b/chardet/charsetgroupprober.py @@ -31,13 +31,13 @@ class CharSetGroupProber(CharSetProber): def __init__(self, lang_filter=None): - super(CharSetGroupProber, self).__init__(lang_filter=lang_filter) + super().__init__(lang_filter=lang_filter) self._active_num = 0 self.probers = [] self._best_guess_prober = None def reset(self): - super(CharSetGroupProber, self).reset() + super().reset() self._active_num = 0 for prober in self.probers: if prober: diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py index eac4e598..78143e98 100644 --- a/chardet/charsetprober.py +++ b/chardet/charsetprober.py @@ -32,7 +32,7 @@ from .enums import ProbingState -class CharSetProber(object): +class CharSetProber: SHORTCUT_THRESHOLD = 0.95 diff --git a/chardet/cli/__init__.py b/chardet/cli/__init__.py index 8b137891..e69de29b 100644 --- a/chardet/cli/__init__.py +++ b/chardet/cli/__init__.py @@ -1 +0,0 @@ - diff --git a/chardet/cli/chardetect.py b/chardet/cli/chardetect.py index bd4934ec..ece9eafa 100644 --- a/chardet/cli/chardetect.py +++ b/chardet/cli/chardetect.py @@ -12,7 +12,6 @@ """ -from __future__ import absolute_import, print_function, unicode_literals import argparse import sys diff --git a/chardet/codingstatemachine.py b/chardet/codingstatemachine.py index 68fba44f..3b2439fd 100644 --- a/chardet/codingstatemachine.py +++ b/chardet/codingstatemachine.py @@ -30,7 +30,7 @@ from .enums import MachineState -class CodingStateMachine(object): +class CodingStateMachine: """ A state machine to verify a byte sequence for a particular encoding. For each byte the detector receives, it will feed that byte to every active diff --git a/chardet/compat.py b/chardet/compat.py index 8941572b..8bbdac1c 100644 --- a/chardet/compat.py +++ b/chardet/compat.py @@ -22,15 +22,8 @@ import sys -if sys.version_info < (3, 0): - PY2 = True - PY3 = False - string_types = (str, unicode) - text_type = unicode - iteritems = dict.iteritems -else: - PY2 = False - PY3 = True - string_types = (bytes, str) - text_type = str - iteritems = dict.items +PY2 = False +PY3 = True +string_types = (bytes, str) +text_type = str +iteritems = dict.items diff --git a/chardet/cp949prober.py b/chardet/cp949prober.py index efd793ab..28a1f3db 100644 --- a/chardet/cp949prober.py +++ b/chardet/cp949prober.py @@ -33,7 +33,7 @@ class CP949Prober(MultiByteCharSetProber): def __init__(self): - super(CP949Prober, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(CP949_SM_MODEL) # NOTE: CP949 is a superset of EUC-KR, so the distribution should be # not different. diff --git a/chardet/enums.py b/chardet/enums.py index 04512072..1b484119 100644 --- a/chardet/enums.py +++ b/chardet/enums.py @@ -5,7 +5,7 @@ """ -class InputState(object): +class InputState: """ This enum represents the different states a universal detector can be in. """ @@ -14,7 +14,7 @@ class InputState(object): HIGH_BYTE = 2 -class LanguageFilter(object): +class LanguageFilter: """ This enum represents the different language filters we can apply to a ``UniversalDetector``. @@ -29,7 +29,7 @@ class LanguageFilter(object): CJK = CHINESE | JAPANESE | KOREAN -class ProbingState(object): +class ProbingState: """ This enum represents the different states a prober can be in. """ @@ -38,7 +38,7 @@ class ProbingState(object): NOT_ME = 2 -class MachineState(object): +class MachineState: """ This enum represents the different states a state machine can be in. """ @@ -47,7 +47,7 @@ class MachineState(object): ITS_ME = 2 -class SequenceLikelihood(object): +class SequenceLikelihood: """ This enum represents the likelihood of a character following the previous one. """ @@ -62,7 +62,7 @@ def get_num_categories(cls): return 4 -class CharacterCategory(object): +class CharacterCategory: """ This enum represents the different categories language models for ``SingleByteCharsetProber`` put characters into. diff --git a/chardet/escprober.py b/chardet/escprober.py index c70493f2..e18bdc76 100644 --- a/chardet/escprober.py +++ b/chardet/escprober.py @@ -40,7 +40,7 @@ class EscCharSetProber(CharSetProber): """ def __init__(self, lang_filter=None): - super(EscCharSetProber, self).__init__(lang_filter=lang_filter) + super().__init__(lang_filter=lang_filter) self.coding_sm = [] if self.lang_filter & LanguageFilter.CHINESE_SIMPLIFIED: self.coding_sm.append(CodingStateMachine(HZ_SM_MODEL)) @@ -56,7 +56,7 @@ def __init__(self, lang_filter=None): self.reset() def reset(self): - super(EscCharSetProber, self).reset() + super().reset() for coding_sm in self.coding_sm: if not coding_sm: continue diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py index 20ce8f7d..a2f3ef4e 100644 --- a/chardet/eucjpprober.py +++ b/chardet/eucjpprober.py @@ -35,14 +35,14 @@ class EUCJPProber(MultiByteCharSetProber): def __init__(self): - super(EUCJPProber, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(EUCJP_SM_MODEL) self.distribution_analyzer = EUCJPDistributionAnalysis() self.context_analyzer = EUCJPContextAnalysis() self.reset() def reset(self): - super(EUCJPProber, self).reset() + super().reset() self.context_analyzer.reset() @property diff --git a/chardet/euckrprober.py b/chardet/euckrprober.py index 345a060d..0c536839 100644 --- a/chardet/euckrprober.py +++ b/chardet/euckrprober.py @@ -33,7 +33,7 @@ class EUCKRProber(MultiByteCharSetProber): def __init__(self): - super(EUCKRProber, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(EUCKR_SM_MODEL) self.distribution_analyzer = EUCKRDistributionAnalysis() self.reset() diff --git a/chardet/euctwprober.py b/chardet/euctwprober.py index 35669cc4..8c528a33 100644 --- a/chardet/euctwprober.py +++ b/chardet/euctwprober.py @@ -32,7 +32,7 @@ class EUCTWProber(MultiByteCharSetProber): def __init__(self): - super(EUCTWProber, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(EUCTW_SM_MODEL) self.distribution_analyzer = EUCTWDistributionAnalysis() self.reset() diff --git a/chardet/gb2312prober.py b/chardet/gb2312prober.py index 8446d2dd..b0c3a67b 100644 --- a/chardet/gb2312prober.py +++ b/chardet/gb2312prober.py @@ -32,7 +32,7 @@ class GB2312Prober(MultiByteCharSetProber): def __init__(self): - super(GB2312Prober, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(GB2312_SM_MODEL) self.distribution_analyzer = GB2312DistributionAnalysis() self.reset() diff --git a/chardet/hebrewprober.py b/chardet/hebrewprober.py index b0e1bf49..61286480 100644 --- a/chardet/hebrewprober.py +++ b/chardet/hebrewprober.py @@ -152,7 +152,7 @@ class HebrewProber(CharSetProber): LOGICAL_HEBREW_NAME = "windows-1255" def __init__(self): - super(HebrewProber, self).__init__() + super().__init__() self._final_char_logical_score = None self._final_char_visual_score = None self._prev = None diff --git a/chardet/jpcntx.py b/chardet/jpcntx.py index 20044e4b..713b7008 100644 --- a/chardet/jpcntx.py +++ b/chardet/jpcntx.py @@ -113,7 +113,7 @@ (0,4,0,3,0,3,0,3,0,3,5,5,3,3,3,3,4,3,4,3,3,3,4,4,4,3,3,3,3,4,3,5,3,3,1,3,2,4,5,5,5,5,4,3,4,5,5,3,2,2,3,3,3,3,2,3,3,1,2,3,2,4,3,3,3,4,0,4,0,2,0,4,3,2,2,1,2,0,3,0,0,4,1), ) -class JapaneseContextAnalysis(object): +class JapaneseContextAnalysis: NUM_OF_CATEGORY = 6 DONT_KNOW = -1 ENOUGH_REL_THRESHOLD = 100 @@ -182,7 +182,7 @@ def get_order(self, byte_str): class SJISContextAnalysis(JapaneseContextAnalysis): def __init__(self): - super(SJISContextAnalysis, self).__init__() + super().__init__() self._charset_name = "SHIFT_JIS" @property diff --git a/chardet/langbulgarianmodel.py b/chardet/langbulgarianmodel.py index 561bfd90..9490af89 100644 --- a/chardet/langbulgarianmodel.py +++ b/chardet/langbulgarianmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langgreekmodel.py b/chardet/langgreekmodel.py index 02b94de6..09bad273 100644 --- a/chardet/langgreekmodel.py +++ b/chardet/langgreekmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langhebrewmodel.py b/chardet/langhebrewmodel.py index 40fd674c..057493b8 100644 --- a/chardet/langhebrewmodel.py +++ b/chardet/langhebrewmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langhungarianmodel.py b/chardet/langhungarianmodel.py index 24a097f5..48c7e5e0 100644 --- a/chardet/langhungarianmodel.py +++ b/chardet/langhungarianmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langrussianmodel.py b/chardet/langrussianmodel.py index 569689d0..fe5f857f 100644 --- a/chardet/langrussianmodel.py +++ b/chardet/langrussianmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langthaimodel.py b/chardet/langthaimodel.py index d0191f24..15023ad4 100644 --- a/chardet/langthaimodel.py +++ b/chardet/langthaimodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/langturkishmodel.py b/chardet/langturkishmodel.py index 8ba93224..4cb30c75 100644 --- a/chardet/langturkishmodel.py +++ b/chardet/langturkishmodel.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- from chardet.sbcharsetprober import SingleByteCharSetModel diff --git a/chardet/latin1prober.py b/chardet/latin1prober.py index 7d1e8c20..efb99d48 100644 --- a/chardet/latin1prober.py +++ b/chardet/latin1prober.py @@ -95,7 +95,7 @@ class Latin1Prober(CharSetProber): def __init__(self): - super(Latin1Prober, self).__init__() + super().__init__() self._last_char_class = None self._freq_counter = None self.reset() diff --git a/chardet/mbcharsetprober.py b/chardet/mbcharsetprober.py index 6256ecfd..071dc854 100644 --- a/chardet/mbcharsetprober.py +++ b/chardet/mbcharsetprober.py @@ -37,13 +37,13 @@ class MultiByteCharSetProber(CharSetProber): """ def __init__(self, lang_filter=None): - super(MultiByteCharSetProber, self).__init__(lang_filter=lang_filter) + super().__init__(lang_filter=lang_filter) self.distribution_analyzer = None self.coding_sm = None self._last_char = [0, 0] def reset(self): - super(MultiByteCharSetProber, self).reset() + super().reset() if self.coding_sm: self.coding_sm.reset() if self.distribution_analyzer: diff --git a/chardet/mbcsgroupprober.py b/chardet/mbcsgroupprober.py index 530abe75..43ffa41f 100644 --- a/chardet/mbcsgroupprober.py +++ b/chardet/mbcsgroupprober.py @@ -40,7 +40,7 @@ class MBCSGroupProber(CharSetGroupProber): def __init__(self, lang_filter=None): - super(MBCSGroupProber, self).__init__(lang_filter=lang_filter) + super().__init__(lang_filter=lang_filter) self.probers = [ UTF8Prober(), SJISProber(), diff --git a/chardet/metadata/languages.py b/chardet/metadata/languages.py index 3237d5ab..b6c53269 100644 --- a/chardet/metadata/languages.py +++ b/chardet/metadata/languages.py @@ -1,19 +1,17 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- """ Metadata about languages used by our model training code for our SingleByteCharSetProbers. Could be used for other things in the future. This code is based on the language metadata from the uchardet project. """ -from __future__ import absolute_import, print_function from string import ascii_letters # TODO: Add Ukranian (KOI8-U) -class Language(object): +class Language: """Metadata about a language useful for training models :ivar name: The human name for the language, in English. @@ -35,7 +33,7 @@ class Language(object): """ def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None, alphabet=None, wiki_start_pages=None): - super(Language, self).__init__() + super().__init__() self.name = name self.iso_code = iso_code self.use_ascii = use_ascii @@ -52,7 +50,7 @@ def __init__(self, name=None, iso_code=None, use_ascii=True, charsets=None, def __repr__(self): return '{}({})'.format(self.__class__.__name__, - ', '.join('{}={!r}'.format(k, v) + ', '.join(f'{k}={v!r}' for k, v in self.__dict__.items() if not k.startswith('_'))) @@ -66,70 +64,70 @@ def __repr__(self): # forms. This means we purposefully skip IBM864. charsets=['ISO-8859-6', 'WINDOWS-1256', 'CP720', 'CP864'], - alphabet=u'ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ', - wiki_start_pages=[u'الصفحة_الرئيسية']), + alphabet='ءآأؤإئابةتثجحخدذرزسشصضطظعغػؼؽؾؿـفقكلمنهوىيًٌٍَُِّ', + wiki_start_pages=['الصفحة_الرئيسية']), 'Belarusian': Language(name='Belarusian', iso_code='be', use_ascii=False, charsets=['ISO-8859-5', 'WINDOWS-1251', 'IBM866', 'MacCyrillic'], - alphabet=(u'АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ' - u'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'), - wiki_start_pages=[u'Галоўная_старонка']), + alphabet=('АБВГДЕЁЖЗІЙКЛМНОПРСТУЎФХЦЧШЫЬЭЮЯ' + 'абвгдеёжзійклмнопрстуўфхцчшыьэюяʼ'), + wiki_start_pages=['Галоўная_старонка']), 'Bulgarian': Language(name='Bulgarian', iso_code='bg', use_ascii=False, charsets=['ISO-8859-5', 'WINDOWS-1251', 'IBM855'], - alphabet=(u'АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ' - u'абвгдежзийклмнопрстуфхцчшщъьюя'), - wiki_start_pages=[u'Начална_страница']), + alphabet=('АБВГДЕЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЬЮЯ' + 'абвгдежзийклмнопрстуфхцчшщъьюя'), + wiki_start_pages=['Начална_страница']), 'Czech': Language(name='Czech', iso_code='cz', use_ascii=True, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ', - wiki_start_pages=[u'Hlavní_strana']), + alphabet='áčďéěíňóřšťúůýžÁČĎÉĚÍŇÓŘŠŤÚŮÝŽ', + wiki_start_pages=['Hlavní_strana']), 'Danish': Language(name='Danish', iso_code='da', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'æøåÆØÅ', - wiki_start_pages=[u'Forside']), + alphabet='æøåÆØÅ', + wiki_start_pages=['Forside']), 'German': Language(name='German', iso_code='de', use_ascii=True, charsets=['ISO-8859-1', 'WINDOWS-1252'], - alphabet=u'äöüßÄÖÜ', - wiki_start_pages=[u'Wikipedia:Hauptseite']), + alphabet='äöüßÄÖÜ', + wiki_start_pages=['Wikipedia:Hauptseite']), 'Greek': Language(name='Greek', iso_code='el', use_ascii=False, charsets=['ISO-8859-7', 'WINDOWS-1253'], - alphabet=(u'αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ' - u'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'), - wiki_start_pages=[u'Πύλη:Κύρια']), + alphabet=('αβγδεζηθικλμνξοπρσςτυφχψωάέήίόύώ' + 'ΑΒΓΔΕΖΗΘΙΚΛΜΝΞΟΠΡΣΣΤΥΦΧΨΩΆΈΉΊΌΎΏ'), + wiki_start_pages=['Πύλη:Κύρια']), 'English': Language(name='English', iso_code='en', use_ascii=True, charsets=['ISO-8859-1', 'WINDOWS-1252'], - wiki_start_pages=[u'Main_Page']), + wiki_start_pages=['Main_Page']), 'Esperanto': Language(name='Esperanto', iso_code='eo', # Q, W, X, and Y not used at all use_ascii=False, charsets=['ISO-8859-3'], - alphabet=(u'abcĉdefgĝhĥijĵklmnoprsŝtuŭvz' - u'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'), - wiki_start_pages=[u'Vikipedio:Ĉefpaĝo']), + alphabet=('abcĉdefgĝhĥijĵklmnoprsŝtuŭvz' + 'ABCĈDEFGĜHĤIJĴKLMNOPRSŜTUŬVZ'), + wiki_start_pages=['Vikipedio:Ĉefpaĝo']), 'Spanish': Language(name='Spanish', iso_code='es', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'ñáéíóúüÑÁÉÍÓÚÜ', - wiki_start_pages=[u'Wikipedia:Portada']), + alphabet='ñáéíóúüÑÁÉÍÓÚÜ', + wiki_start_pages=['Wikipedia:Portada']), 'Estonian': Language(name='Estonian', iso_code='et', use_ascii=False, @@ -137,153 +135,153 @@ def __repr__(self): 'WINDOWS-1257'], # C, F, Š, Q, W, X, Y, Z, Ž are only for # loanwords - alphabet=(u'ABDEGHIJKLMNOPRSTUVÕÄÖÜ' - u'abdeghijklmnoprstuvõäöü'), - wiki_start_pages=[u'Esileht']), + alphabet=('ABDEGHIJKLMNOPRSTUVÕÄÖÜ' + 'abdeghijklmnoprstuvõäöü'), + wiki_start_pages=['Esileht']), 'Finnish': Language(name='Finnish', iso_code='fi', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'ÅÄÖŠŽåäöšž', - wiki_start_pages=[u'Wikipedia:Etusivu']), + alphabet='ÅÄÖŠŽåäöšž', + wiki_start_pages=['Wikipedia:Etusivu']), 'French': Language(name='French', iso_code='fr', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ', - wiki_start_pages=[u'Wikipédia:Accueil_principal', - u'Bœuf (animal)']), + alphabet='œàâçèéîïùûêŒÀÂÇÈÉÎÏÙÛÊ', + wiki_start_pages=['Wikipédia:Accueil_principal', + 'Bœuf (animal)']), 'Hebrew': Language(name='Hebrew', iso_code='he', use_ascii=False, charsets=['ISO-8859-8', 'WINDOWS-1255'], - alphabet=u'אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ', - wiki_start_pages=[u'עמוד_ראשי']), + alphabet='אבגדהוזחטיךכלםמןנסעףפץצקרשתװױײ', + wiki_start_pages=['עמוד_ראשי']), 'Croatian': Language(name='Croatian', iso_code='hr', # Q, W, X, Y are only used for foreign words. use_ascii=False, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcčćdđefghijklmnoprsštuvzž' - u'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'), - wiki_start_pages=[u'Glavna_stranica']), + alphabet=('abcčćdđefghijklmnoprsštuvzž' + 'ABCČĆDĐEFGHIJKLMNOPRSŠTUVZŽ'), + wiki_start_pages=['Glavna_stranica']), 'Hungarian': Language(name='Hungarian', iso_code='hu', # Q, W, X, Y are only used for foreign words. use_ascii=False, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcdefghijklmnoprstuvzáéíóöőúüű' - u'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'), - wiki_start_pages=[u'Kezdőlap']), + alphabet=('abcdefghijklmnoprstuvzáéíóöőúüű' + 'ABCDEFGHIJKLMNOPRSTUVZÁÉÍÓÖŐÚÜŰ'), + wiki_start_pages=['Kezdőlap']), 'Italian': Language(name='Italian', iso_code='it', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'ÀÈÉÌÒÓÙàèéìòóù', - wiki_start_pages=[u'Pagina_principale']), + alphabet='ÀÈÉÌÒÓÙàèéìòóù', + wiki_start_pages=['Pagina_principale']), 'Lithuanian': Language(name='Lithuanian', iso_code='lt', use_ascii=False, charsets=['ISO-8859-13', 'WINDOWS-1257', 'ISO-8859-4'], # Q, W, and X not used at all - alphabet=(u'AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ' - u'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'), - wiki_start_pages=[u'Pagrindinis_puslapis']), + alphabet=('AĄBCČDEĘĖFGHIĮYJKLMNOPRSŠTUŲŪVZŽ' + 'aąbcčdeęėfghiįyjklmnoprsštuųūvzž'), + wiki_start_pages=['Pagrindinis_puslapis']), 'Latvian': Language(name='Latvian', iso_code='lv', use_ascii=False, charsets=['ISO-8859-13', 'WINDOWS-1257', 'ISO-8859-4'], # Q, W, X, Y are only for loanwords - alphabet=(u'AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ' - u'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'), - wiki_start_pages=[u'Sākumlapa']), + alphabet=('AĀBCČDEĒFGĢHIĪJKĶLĻMNŅOPRSŠTUŪVZŽ' + 'aābcčdeēfgģhiījkķlļmnņoprsštuūvzž'), + wiki_start_pages=['Sākumlapa']), 'Macedonian': Language(name='Macedonian', iso_code='mk', use_ascii=False, charsets=['ISO-8859-5', 'WINDOWS-1251', 'MacCyrillic', 'IBM855'], - alphabet=(u'АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ' - u'абвгдѓежзѕијклљмнњопрстќуфхцчџш'), - wiki_start_pages=[u'Главна_страница']), + alphabet=('АБВГДЃЕЖЗЅИЈКЛЉМНЊОПРСТЌУФХЦЧЏШ' + 'абвгдѓежзѕијклљмнњопрстќуфхцчџш'), + wiki_start_pages=['Главна_страница']), 'Dutch': Language(name='Dutch', iso_code='nl', use_ascii=True, charsets=['ISO-8859-1', 'WINDOWS-1252'], - wiki_start_pages=[u'Hoofdpagina']), + wiki_start_pages=['Hoofdpagina']), 'Polish': Language(name='Polish', iso_code='pl', # Q and X are only used for foreign words. use_ascii=False, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ' - u'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'), - wiki_start_pages=[u'Wikipedia:Strona_główna']), + alphabet=('AĄBCĆDEĘFGHIJKLŁMNŃOÓPRSŚTUWYZŹŻ' + 'aąbcćdeęfghijklłmnńoóprsśtuwyzźż'), + wiki_start_pages=['Wikipedia:Strona_główna']), 'Portuguese': Language(name='Portuguese', iso_code='pt', use_ascii=True, charsets=['ISO-8859-1', 'ISO-8859-15', 'WINDOWS-1252'], - alphabet=u'ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú', - wiki_start_pages=[u'Wikipédia:Página_principal']), + alphabet='ÁÂÃÀÇÉÊÍÓÔÕÚáâãàçéêíóôõú', + wiki_start_pages=['Wikipédia:Página_principal']), 'Romanian': Language(name='Romanian', iso_code='ro', use_ascii=True, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'ăâîșțĂÂÎȘȚ', - wiki_start_pages=[u'Pagina_principală']), + alphabet='ăâîșțĂÂÎȘȚ', + wiki_start_pages=['Pagina_principală']), 'Russian': Language(name='Russian', iso_code='ru', use_ascii=False, charsets=['ISO-8859-5', 'WINDOWS-1251', 'KOI8-R', 'MacCyrillic', 'IBM866', 'IBM855'], - alphabet=(u'абвгдеёжзийклмнопрстуфхцчшщъыьэюя' - u'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'), - wiki_start_pages=[u'Заглавная_страница']), + alphabet=('абвгдеёжзийклмнопрстуфхцчшщъыьэюя' + 'АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯ'), + wiki_start_pages=['Заглавная_страница']), 'Slovak': Language(name='Slovak', iso_code='sk', use_ascii=True, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=u'áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ', - wiki_start_pages=[u'Hlavná_stránka']), + alphabet='áäčďéíĺľňóôŕšťúýžÁÄČĎÉÍĹĽŇÓÔŔŠŤÚÝŽ', + wiki_start_pages=['Hlavná_stránka']), 'Slovene': Language(name='Slovene', iso_code='sl', # Q, W, X, Y are only used for foreign words. use_ascii=False, charsets=['ISO-8859-2', 'WINDOWS-1250'], - alphabet=(u'abcčdefghijklmnoprsštuvzž' - u'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'), - wiki_start_pages=[u'Glavna_stran']), + alphabet=('abcčdefghijklmnoprsštuvzž' + 'ABCČDEFGHIJKLMNOPRSŠTUVZŽ'), + wiki_start_pages=['Glavna_stran']), # Serbian can be written in both Latin and Cyrillic, but there's no # simple way to get the Latin alphabet pages from Wikipedia through # the API, so for now we just support Cyrillic. 'Serbian': Language(name='Serbian', iso_code='sr', - alphabet=(u'АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ' - u'абвгдђежзијклљмнњопрстћуфхцчџш'), + alphabet=('АБВГДЂЕЖЗИЈКЛЉМНЊОПРСТЋУФХЦЧЏШ' + 'абвгдђежзијклљмнњопрстћуфхцчџш'), charsets=['ISO-8859-5', 'WINDOWS-1251', 'MacCyrillic', 'IBM855'], - wiki_start_pages=[u'Главна_страна']), + wiki_start_pages=['Главна_страна']), 'Thai': Language(name='Thai', iso_code='th', use_ascii=False, charsets=['ISO-8859-11', 'TIS-620', 'CP874'], - alphabet=u'กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛', - wiki_start_pages=[u'หน้าหลัก']), + alphabet='กขฃคฅฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลฦวศษสหฬอฮฯะัาำิีึืฺุู฿เแโใไๅๆ็่้๊๋์ํ๎๏๐๑๒๓๔๕๖๗๘๙๚๛', + wiki_start_pages=['หน้าหลัก']), 'Turkish': Language(name='Turkish', iso_code='tr', # Q, W, and X are not used by Turkish use_ascii=False, charsets=['ISO-8859-3', 'ISO-8859-9', 'WINDOWS-1254'], - alphabet=(u'abcçdefgğhıijklmnoöprsştuüvyzâîû' - u'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'), - wiki_start_pages=[u'Ana_Sayfa']), + alphabet=('abcçdefgğhıijklmnoöprsştuüvyzâîû' + 'ABCÇDEFGĞHIİJKLMNOÖPRSŞTUÜVYZÂÎÛ'), + wiki_start_pages=['Ana_Sayfa']), 'Vietnamese': Language(name='Vietnamese', iso_code='vi', use_ascii=False, @@ -304,7 +302,7 @@ def __repr__(self): # the adoption of Unicode on the World Wide # Web. charsets=['WINDOWS-1258'], - alphabet=(u'aăâbcdđeêghiklmnoôơpqrstuưvxy' - u'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'), - wiki_start_pages=[u'Chữ_Quốc_ngữ']), + alphabet=('aăâbcdđeêghiklmnoôơpqrstuưvxy' + 'AĂÂBCDĐEÊGHIKLMNOÔƠPQRSTUƯVXY'), + wiki_start_pages=['Chữ_Quốc_ngữ']), } diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py index 46ba835c..bea8759c 100644 --- a/chardet/sbcharsetprober.py +++ b/chardet/sbcharsetprober.py @@ -49,7 +49,7 @@ class SingleByteCharSetProber(CharSetProber): NEGATIVE_SHORTCUT_THRESHOLD = 0.05 def __init__(self, model, reversed=False, name_prober=None): - super(SingleByteCharSetProber, self).__init__() + super().__init__() self._model = model # TRUE if we need to reverse every pair in the model lookup self._reversed = reversed @@ -63,7 +63,7 @@ def __init__(self, model, reversed=False, name_prober=None): self.reset() def reset(self): - super(SingleByteCharSetProber, self).reset() + super().reset() # char order of last character self._last_order = 255 self._seq_counters = [0] * SequenceLikelihood.get_num_categories() diff --git a/chardet/sbcsgroupprober.py b/chardet/sbcsgroupprober.py index bdeef4e1..7e59e4a5 100644 --- a/chardet/sbcsgroupprober.py +++ b/chardet/sbcsgroupprober.py @@ -45,7 +45,7 @@ class SBCSGroupProber(CharSetGroupProber): def __init__(self): - super(SBCSGroupProber, self).__init__() + super().__init__() hebrew_prober = HebrewProber() logical_hebrew_prober = SingleByteCharSetProber(WINDOWS_1255_HEBREW_MODEL, False, hebrew_prober) diff --git a/chardet/sjisprober.py b/chardet/sjisprober.py index 9e29623b..26292fa3 100644 --- a/chardet/sjisprober.py +++ b/chardet/sjisprober.py @@ -35,14 +35,14 @@ class SJISProber(MultiByteCharSetProber): def __init__(self): - super(SJISProber, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(SJIS_SM_MODEL) self.distribution_analyzer = SJISDistributionAnalysis() self.context_analyzer = SJISContextAnalysis() self.reset() def reset(self): - super(SJISProber, self).reset() + super().reset() self.context_analyzer.reset() @property diff --git a/chardet/universaldetector.py b/chardet/universaldetector.py index 055a8ac1..bfff375a 100644 --- a/chardet/universaldetector.py +++ b/chardet/universaldetector.py @@ -48,7 +48,7 @@ class a user of ``chardet`` should use. from .sbcsgroupprober import SBCSGroupProber -class UniversalDetector(object): +class UniversalDetector: """ The ``UniversalDetector`` class underlies the ``chardet.detect`` function and coordinates all of the different charset probers. diff --git a/chardet/utf8prober.py b/chardet/utf8prober.py index 6c3196cc..c88e86f1 100644 --- a/chardet/utf8prober.py +++ b/chardet/utf8prober.py @@ -36,13 +36,13 @@ class UTF8Prober(CharSetProber): ONE_CHAR_PROB = 0.5 def __init__(self): - super(UTF8Prober, self).__init__() + super().__init__() self.coding_sm = CodingStateMachine(UTF8_SM_MODEL) self._num_mb_chars = None self.reset() def reset(self): - super(UTF8Prober, self).reset() + super().reset() self.coding_sm.reset() self._num_mb_chars = 0 diff --git a/convert_language_model.py b/convert_language_model.py index ebeb52f5..7892d1ba 100755 --- a/convert_language_model.py +++ b/convert_language_model.py @@ -1,5 +1,4 @@ #!/usr/bin/env python -# -*- coding: utf-8 -*- ######################## BEGIN LICENSE BLOCK ######################## # Contributor(s): @@ -26,12 +25,10 @@ """ Convert old style SBCS model to new """ -from __future__ import absolute_import, print_function import os import sys from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter -from io import open from string import ascii_letters import chardet diff --git a/docs/conf.py b/docs/conf.py index 2038913a..686010cc 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,5 +1,4 @@ #!/usr/bin/env python3 -# -*- coding: utf-8 -*- # # chardet documentation build configuration file, created by # sphinx-quickstart on Thu Mar 27 00:17:49 2015. diff --git a/test.py b/test.py index e39f28bd..50d26512 100644 --- a/test.py +++ b/test.py @@ -5,11 +5,9 @@ :author: Ian Cordasco """ -from __future__ import with_statement import textwrap from difflib import ndiff -from io import open from os import listdir from os.path import dirname, isdir, join, realpath, relpath, splitext