Add upstream changes and clean up where possible #42

Merged
merged 38 commits on Jan 9, 2015
8bc4b89
Comment out sections of tables that weren't used to save memory.
dan-blanchard Oct 11, 2014
20cad49
Add 3.4 to list for Travis testing and remove 3.2
dan-blanchard Oct 11, 2014
44752d7
Bunch of little clean up things
dan-blanchard Dec 1, 2014
7251430
Merge branch 'master' into feature/upstream-changes-and-overhaul
dan-blanchard Dec 2, 2014
0ecaf05
Add if __name__... to test.py and a break to speed things up in loop.
dan-blanchard Dec 2, 2014
9b8b12c
Modernize testings
dan-blanchard Dec 2, 2014
3e13cc7
Fix missing req_path in setup.py
dan-blanchard Dec 2, 2014
59f30b7
Simplify Travis setup and just use pip. conda was overkill for our s…
dan-blanchard Dec 2, 2014
a5c7484
Make tests slightly more efficient.
dan-blanchard Dec 2, 2014
07a5849
Merge branch 'master' into feature/upstream-changes-and-overhaul
dan-blanchard Dec 21, 2014
267c5d8
Switch to new Travis docker VMs and add PyPy testing.
dan-blanchard Dec 29, 2014
c665459
Add C-equivalent implementation of filter_english_letters.
dan-blanchard Dec 30, 2014
7cfa45c
Fix some pylint warnings in universaldetector.py
dan-blanchard Dec 30, 2014
125575f
Made latin1 equivalent to windows-1252 when running unit tests.
dan-blanchard Dec 30, 2014
d9c42c7
A bunch of little clean up changes.
dan-blanchard Dec 30, 2014
04398ff
Comment out pypy line in .travis.yml. It's 10x slower, which is ridi…
dan-blanchard Dec 30, 2014
475ffa6
Re-enable PyPy on Travis, but disable coverage for it
dan-blanchard Dec 30, 2014
2eae0d6
Fix syntax error in .travis.yml
dan-blanchard Dec 30, 2014
b45c331
Fix coverage logic reversal in .travis.yml
dan-blanchard Dec 30, 2014
b382f22
Fix TypeError on PyPy in utf8prober.py
dan-blanchard Dec 30, 2014
be09612
Switch to using enums instead of constants, and a bunch of cleanup st…
dan-blanchard Jan 2, 2015
431bd39
Get rid of set literal to appease Python 2.6
dan-blanchard Jan 2, 2015
3fb82c9
Some minor PEP8 name changes
dan-blanchard Jan 5, 2015
6058456
Merge branch 'master' into feature/upstream-changes-and-overhaul
dan-blanchard Jan 5, 2015
bd9951f
Loads of PEP8 naming convention fixes.
dan-blanchard Jan 5, 2015
4317be7
Fix some NOTES.rst formatting issues
dan-blanchard Jan 6, 2015
01e82e3
Update MANIFEST.in to include test files and docs
dan-blanchard Jan 6, 2015
b8f8b24
Remove PyCharm stuff from .gitignore
dan-blanchard Jan 6, 2015
50f701c
Remove flake8: noqa lines.
dan-blanchard Jan 6, 2015
e42c4d1
Add missing __version__ import to __init__.py
dan-blanchard Jan 6, 2015
0913a91
Remove unnecessary import sys import from conf.py
dan-blanchard Jan 6, 2015
c7f01c1
Switch to using pip for installation in .travis.yml
dan-blanchard Jan 6, 2015
1e0f1a5
Rename SMState to MachineState
dan-blanchard Jan 6, 2015
4a8084d
Get rid of messy ternary operator in charsetprober.py
dan-blanchard Jan 6, 2015
5449248
Fix __version typo in __init__.py
dan-blanchard Jan 6, 2015
369875d
Add comment about why we're slicing in filter_with_english_letters
dan-blanchard Jan 6, 2015
8e3fc03
Made more attributes public.
dan-blanchard Jan 6, 2015
da6c0a0
Temporarily disable Hungarian probers, and update missing encodings list
dan-blanchard Jan 7, 2015
Switch to using enums instead of constants, and a bunch of cleanup stuff.

Clean up stuff includes:

-  Stop importing chardet from setup.py now that we have a dependency
   (enum34 on everything other than Python 3.4).
-  Add a lot of notes to NOTES.rst about how chardet actually works.
-  Removed sections of frequency rank tables that we do not actually use.
   It was just wasting memory.
-  Removed "m" prefix from attributes all over.  Will fix snake case and
   things like that in a later commit.
-  Added a lot of comments to UniversalDetector.
-  Added the ability to ignore certain encodings when running unit tests.
   This was necessary because we don't actually support some of the
   encodings we were being tested on!
-  Removed constants.py now that we have enums.py
-  Switched to using logging instead of printing to sys.stderr.  This
   actually should help a lot for debugging failed unit tests.
-  Made a CLI sub-package for chardetect.  In the future, we'll
   reorganize more things into sub-packages.
dan-blanchard committed Jan 2, 2015
commit be09612a5779a51695c5a69b7b73fd1b4ba12a3e
124 changes: 121 additions & 3 deletions NOTES.rst
@@ -1,4 +1,122 @@
This is just a collection of information that I've found useful or thought
Class Hierarchy for chardet
===========================

Universal Detector
------------------
Has a list of probers.

CharSetProber
-------------
Mostly abstract parent class.

CharSetGroupProber
------------------
Runs a bunch of related probers at the same time and decides which is best.
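The "decides which is best" step can be sketched roughly like this. This is a toy model: `SketchProber` and `best_prober` are stand-ins invented for illustration, not chardet classes.

```python
# Toy sketch of a group prober: feed the same bytes to several child
# probers and report the one with the highest confidence.

class SketchProber(object):
    """Stand-in for a real CharSetProber (illustrative only)."""

    def __init__(self, charset_name, score):
        self.charset_name = charset_name
        self._score = score

    def feed(self, byte_str):
        # A real prober would update its internal state here.
        pass

    def get_confidence(self):
        return self._score


def best_prober(probers, byte_str):
    """Feed all probers, then pick the most confident one."""
    for prober in probers:
        prober.feed(byte_str)
    return max(probers, key=lambda p: p.get_confidence())


probers = [SketchProber('windows-1251', 0.4), SketchProber('KOI8-R', 0.7)]
print(best_prober(probers, b'...').charset_name)  # KOI8-R
```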

SBCSGroupProber
---------------
SBCS = Single-Byte Character Set. Runs a bunch of SingleByteCharSetProbers.
Always contains the same SingleByteCharSetProbers.

SingleByteCharSetProber
-----------------------
A CharSetProber that is used for detecting single-byte encodings by using
a "precedence matrix" (i.e., a character bigram model).
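A "precedence matrix" in miniature: the bigram set below is invented for illustration; chardet's real tables are far larger and operate on per-language byte orders, not characters.

```python
# Toy bigram model: confidence is the fraction of adjacent character
# pairs that are "likely" for the target language.

LIKELY_BIGRAMS = {('t', 'h'), ('h', 'e'), ('i', 'n'), ('e', 'r')}  # invented

def bigram_confidence(text):
    pairs = list(zip(text, text[1:]))
    if not pairs:
        return 0.0
    hits = sum(1 for pair in pairs if pair in LIKELY_BIGRAMS)
    return hits / len(pairs)

print(bigram_confidence('the'))  # ('t','h') and ('h','e') both hit -> 1.0
```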

MBCSGroupProber
---------------
Runs a bunch of MultiByteCharSetProbers. It also uses a UTF8Prober, which is
essentially a MultiByteCharSetProber that only has a state machine. Always
contains the same MultiByteCharSetProbers.

MultiByteCharSetProber
----------------------
A CharSetProber that uses both a character unigram model (or "character
distribution analysis") and an independent state machine for trying to
detect an encoding.
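In outline, the two signals combine like this (an illustrative sketch, not chardet's actual control flow):

```python
# The state machine can veto an encoding outright on an invalid byte
# sequence, while the character distribution analysis supplies the
# graded confidence.

def multi_byte_confidence(state_machine_error, distribution_confidence):
    if state_machine_error:
        return 0.0  # invalid byte sequence rules the encoding out
    return distribution_confidence

print(multi_byte_confidence(False, 0.82))  # 0.82
print(multi_byte_confidence(True, 0.82))   # 0.0
```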

CodingStateMachine
------------------
Used for "coding scheme" detection, where we just look for either invalid
byte sequences or sequences that only occur for that particular encoding.
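For example, UTF-8 coding-scheme detection boils down to walking a byte state machine and rejecting invalid sequences. The function below mirrors the idea only; chardet's real CodingStateMachine is table-driven and this sketch skips some overlong-sequence checks.

```python
# Minimal sketch of coding-scheme detection for UTF-8: track how many
# continuation bytes we still expect and fail on anything invalid.

def looks_like_utf8(data):
    remaining = 0  # continuation bytes still expected
    for byte in data:
        if remaining:
            if 0x80 <= byte <= 0xBF:
                remaining -= 1  # valid continuation byte
            else:
                return False
        elif byte < 0x80:
            continue  # ASCII
        elif 0xC2 <= byte <= 0xDF:
            remaining = 1  # 2-byte sequence
        elif 0xE0 <= byte <= 0xEF:
            remaining = 2  # 3-byte sequence
        elif 0xF0 <= byte <= 0xF4:
            remaining = 3  # 4-byte sequence
        else:
            return False  # never a valid UTF-8 lead byte
    return remaining == 0

print(looks_like_utf8('héllo'.encode('utf-8')))  # True
print(looks_like_utf8(b'\xff\xfe'))              # False
```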

CharDistributionAnalysis
------------------------
Used for character unigram distribution encoding detection. Takes a mapping
from characters to a "frequency order" (i.e., what frequency rank that byte has
in the given encoding) and a "typical distribution ratio", which is the number
of occurrences of the 512 most frequently used characters divided by the number
of occurrences of the rest of the characters for a typical document.
The "characters" in this case are 2-byte sequences and they are first converted
to an "order" (name comes from ord() function, I believe). This "order" is used
to index into the frequency order table to determine the frequency rank of that
byte sequence. The reason this extra step is necessary is that the frequency
rank table is language-specific (and not encoding-specific).
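As a worked example of that ratio: with a made-up typical distribution ratio of 0.75 (not one of chardet's real per-language constants), confidence is the count of high-frequency characters divided by the weighted count of the rest.

```python
# Worked sketch of the unigram-distribution confidence described above.
# freq_chars counts characters whose frequency rank is under 512.

def distribution_confidence(freq_chars, total_chars, typical_ratio=0.75):
    if total_chars <= 0 or freq_chars <= 0:
        return 0.01  # effectively "no"
    if total_chars == freq_chars:
        return 0.99  # every character was high-frequency
    r = freq_chars / ((total_chars - freq_chars) * typical_ratio)
    return min(r, 0.99)

# 60 high-frequency characters out of 500 total:
print(round(distribution_confidence(60, 500), 3))  # 0.182
```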


What's where
============


Bigram files
------------
Member

Aren't there supposed to be new-lines between the ---s and the list? Really between any header and the text below?

Member Author

The blank lines are optional according to the RST spec.

- hebrewprober.py
- jpcntxprober.py
- langbulgarianmodel.py
- langcyrillicmodel.py
- langgreekmodel.py
- langhebrewmodel.py
- langhungarianmodel.py
- langthaimodel.py
- latin1prober.py
- sbcharsetprober.py
- sbcsgroupprober.py


Coding Scheme files
-------------------
- escprober.py
- escsm.py
- utf8prober.py
- codingstatemachine.py
- mbcssmprober.py


Unigram files
-------------
- big5freqprober.py
- chardistribution.py
- euckrfreqprober.py
- euctwfreqprober.py
- gb2312freqprober.py
- jisfreqprober.py

Multibyte probers
-----------------
- big5prober.py
- cp949prober.py
- eucjpprober.py
- euckrprober.py
- euctwprober.py
- gb2312prober.py
- mbcharsetprober.py
- mbcsgroupprober.py
- sjisprober.py

Misc files
----------
- __init__.py (currently has detect function in it)
Member

Will this display funky? Should we escape it like:

- ``__init__.py``

Member Author

Yes, but I really was just updating that file to make some notes about things for myself. I didn't think anyone else was likely to read it. I'll fix it either way though. :)

Member Author

Actually, I just checked here and it renders fine as is.

Member

GitHub's rendering (of reStructuredText) is historically broken. Relying on that has bitten me in the past.

Member Author

Okay, I'll make all of the filenames have surrounding backticks then for consistent formatting.

- compat.py
- enums.py
- universaldetector.py
- version.py


Useful links
============


This is just a collection of information that I've found useful or thought
might be useful in the future:

- `BOM by Encoding`_
@@ -8,8 +126,8 @@ might be useful in the future:
- `What Every Programmer Absolutely...`_

- The actual `source`_


.. _BOM by Encoding:
https://en.wikipedia.org/wiki/Byte_order_mark#Representations_of_byte_order_marks_by_encoding
.. _A Composite Approach to Language/Encoding Detection:
7 changes: 2 additions & 5 deletions chardet/__init__.py
@@ -15,20 +15,17 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

__version__ = "2.3.0"
from sys import version_info

from .compat import PY2, PY3
from .universaldetector import UniversalDetector


def detect(aBuf):
if (PY2 and isinstance(aBuf, unicode)) or (PY3 and
not isinstance(aBuf, bytes)):
raise ValueError('Expected a bytes object, not a unicode object')

from . import universaldetector
u = universaldetector.UniversalDetector()
u.reset()
u = UniversalDetector()
u.feed(aBuf)
u.close()
return u.result
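The new up-front type check in detect() behaves like this stand-alone sketch, where the real UniversalDetector machinery is replaced by a stub result so the snippet runs on its own:

```python
# Stand-alone sketch of the ValueError guard added to detect() above.
import sys

PY2 = sys.version_info[0] == 2
PY3 = sys.version_info[0] == 3

def detect(byte_str):
    # Short-circuiting keeps the Python-2-only `unicode` name from being
    # evaluated on Python 3.
    if (PY2 and isinstance(byte_str, unicode)) or (PY3 and
            not isinstance(byte_str, bytes)):
        raise ValueError('Expected a bytes object, not a unicode object')
    return {'encoding': 'ascii', 'confidence': 1.0}  # stub result

print(detect(b'hello')['encoding'])  # ascii
try:
    detect(u'hello')
except ValueError as exc:
    print(exc)  # Expected a bytes object, not a unicode object
```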
545 changes: 1 addition & 544 deletions chardet/big5freq.py


4 changes: 2 additions & 2 deletions chardet/big5prober.py
@@ -34,8 +34,8 @@
class Big5Prober(MultiByteCharSetProber):
def __init__(self):
super(Big5Prober, self).__init__()
self._mCodingSM = CodingStateMachine(Big5SMModel)
self._mDistributionAnalyzer = Big5DistributionAnalysis()
self._CodingSM = CodingStateMachine(Big5SMModel)
self._DistributionAnalyzer = Big5DistributionAnalysis()
self.reset()

def get_charset_name(self):
72 changes: 36 additions & 36 deletions chardet/chardistribution.py
@@ -48,26 +48,26 @@ class CharDistributionAnalysis(object):
def __init__(self):
# Mapping table to get frequency order from char order (get from
# GetOrder())
self._mCharToFreqOrder = None
self._mTableSize = None # Size of above table
self._CharToFreqOrder = None
self._TableSize = None # Size of above table
# This is a constant value which varies from language to language,
# used in calculating confidence. See
# http://www.mozilla.org/projects/intl/UniversalCharsetDetection.html
# for further detail.
self._mTypicalDistributionRatio = None
self._mDone = None
self._mTotalChars = None
self._mFreqChars = None
self._TypicalDistributionRatio = None
self._Done = None
self._TotalChars = None
self._FreqChars = None
self.reset()

def reset(self):
"""reset analyser, clear any state"""
# If this flag is set to True, detection is done and conclusion has
# been made
self._mDone = False
self._mTotalChars = 0 # Total characters encountered
self._Done = False
self._TotalChars = 0 # Total characters encountered
# The number of characters whose frequency order is less than 512
self._mFreqChars = 0
self._FreqChars = 0

def feed(self, aBuf, aCharLen):
"""feed a character with known length"""
@@ -77,22 +77,22 @@ def feed(self, aBuf, aCharLen):
else:
order = -1
if order >= 0:
self._mTotalChars += 1
self._TotalChars += 1
# order is valid
if order < self._mTableSize:
if 512 > self._mCharToFreqOrder[order]:
self._mFreqChars += 1
if order < self._TableSize:
if 512 > self._CharToFreqOrder[order]:
self._FreqChars += 1

def get_confidence(self):
"""return confidence based on existing data"""
# if we didn't receive any character in our consideration range,
# return negative answer
if self._mTotalChars <= 0 or self._mFreqChars <= MINIMUM_DATA_THRESHOLD:
if self._TotalChars <= 0 or self._FreqChars <= MINIMUM_DATA_THRESHOLD:
return SURE_NO

if self._mTotalChars != self._mFreqChars:
r = (self._mFreqChars / ((self._mTotalChars - self._mFreqChars)
* self._mTypicalDistributionRatio))
if self._TotalChars != self._FreqChars:
r = (self._FreqChars / ((self._TotalChars - self._FreqChars)
* self._TypicalDistributionRatio))
if r < SURE_YES:
return r

@@ -102,7 +102,7 @@ def get_confidence(self):
def got_enough_data(self):
# It is not necessary to receive all data to draw conclusion.
# For charset detection, certain amount of data is enough
return self._mTotalChars > ENOUGH_DATA_THRESHOLD
return self._TotalChars > ENOUGH_DATA_THRESHOLD

def get_order(self, aBuf):
# We do not handle characters based on the original encoding string,
@@ -115,9 +115,9 @@ def get_order(self, aBuf):
class EUCTWDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCTWDistributionAnalysis, self).__init__()
self._mCharToFreqOrder = EUCTWCharToFreqOrder
self._mTableSize = EUCTW_TABLE_SIZE
self._mTypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = EUCTWCharToFreqOrder
self._TableSize = EUCTW_TABLE_SIZE
self._TypicalDistributionRatio = EUCTW_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for euc-TW encoding, we are interested
@@ -134,9 +134,9 @@ def get_order(self, aBuf):
class EUCKRDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCKRDistributionAnalysis, self).__init__()
self._mCharToFreqOrder = EUCKRCharToFreqOrder
self._mTableSize = EUCKR_TABLE_SIZE
self._mTypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = EUCKRCharToFreqOrder
self._TableSize = EUCKR_TABLE_SIZE
self._TypicalDistributionRatio = EUCKR_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for euc-KR encoding, we are interested
@@ -153,9 +153,9 @@ def get_order(self, aBuf):
class GB2312DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(GB2312DistributionAnalysis, self).__init__()
self._mCharToFreqOrder = GB2312CharToFreqOrder
self._mTableSize = GB2312_TABLE_SIZE
self._mTypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = GB2312CharToFreqOrder
self._TableSize = GB2312_TABLE_SIZE
self._TypicalDistributionRatio = GB2312_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for GB2312 encoding, we are interested
@@ -172,9 +172,9 @@ def get_order(self, aBuf):
class Big5DistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(Big5DistributionAnalysis, self).__init__()
self._mCharToFreqOrder = Big5CharToFreqOrder
self._mTableSize = BIG5_TABLE_SIZE
self._mTypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = Big5CharToFreqOrder
self._TableSize = BIG5_TABLE_SIZE
self._TypicalDistributionRatio = BIG5_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for big5 encoding, we are interested
@@ -194,9 +194,9 @@ def get_order(self, aBuf):
class SJISDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(SJISDistributionAnalysis, self).__init__()
self._mCharToFreqOrder = JISCharToFreqOrder
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = JISCharToFreqOrder
self._TableSize = JIS_TABLE_SIZE
self._TypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for sjis encoding, we are interested
@@ -219,9 +219,9 @@ def get_order(self, aBuf):
class EUCJPDistributionAnalysis(CharDistributionAnalysis):
def __init__(self):
super(EUCJPDistributionAnalysis, self).__init__()
self._mCharToFreqOrder = JISCharToFreqOrder
self._mTableSize = JIS_TABLE_SIZE
self._mTypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO
self._CharToFreqOrder = JISCharToFreqOrder
self._TableSize = JIS_TABLE_SIZE
self._TypicalDistributionRatio = JIS_TYPICAL_DISTRIBUTION_RATIO

def get_order(self, aBuf):
# for euc-JP encoding, we are interested