dcramer · puzzlet · Nov 4, 2012
diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants
+from chardet.compat import _bytechar, _byteord
 from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
 from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
 from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
@@ -45,7 +45,7 @@ def __init__(self):
 
     def reset(self):
         """reset analyser, clear any state"""
-        self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
+        self._mDone = False # If this flag is set to True, detection is done and conclusion has been made
         self._mTotalChars = 0 # Total characters encountered
         self._mFreqChars = 0 # The number of characters whose frequency order is less than 512
 
@@ -100,8 +100,8 @@ def get_order(self, aStr):
         #   first  byte range: 0xc4 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        if aStr[0] >= '\xC4':
-            return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
+        if aStr[0] >= _bytechar(0xC4):
+            return 94 * (_byteord(aStr[0]) - 0xC4) + _byteord(aStr[1]) - 0xA1
         else:
             return -1
 
@@ -117,8 +117,8 @@ def get_order(self, aStr):
         #   first  byte range: 0xb0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        if aStr[0] >= '\xB0':
-            return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
+        if aStr[0] >= _bytechar(0xB0):
+            return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1
         else:
             return -1;
 
@@ -134,8 +134,8 @@ def get_order(self, aStr):
         #  first  byte range: 0xb0 -- 0xfe
         #  second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
-            return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
+        if (aStr[0] >= _bytechar(0xB0)) and (aStr[1] >= _bytechar(0xA1)):
+            return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1
         else:
             return -1;
 
@@ -151,11 +151,11 @@ def get_order(self, aStr):
         #   first  byte range: 0xa4 -- 0xfe
         #   second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        if aStr[0] >= '\xA4':
-            if aStr[1] >= '\xA1':
-                return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
+        if aStr[0] >= _bytechar(0xA4):
+            if aStr[1] >= _bytechar(0xA1):
+                return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0xA1 + 63
             else:
-                return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
+                return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0x40
         else:
             return -1
 
@@ -171,15 +171,15 @@ def get_order(self, aStr):
         #   first  byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
         #   second byte range: 0x40 -- 0x7e,  0x81 -- oxfe
         # no validation needed here. State machine has done that
-        if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
-            order = 188 * (ord(aStr[0]) - 0x81)
-        elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
-            order = 188 * (ord(aStr[0]) - 0xE0 + 31)
+        if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)):
+            order = 188 * (_byteord(aStr[0]) - 0x81)
+        elif (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xEF)):
+            order = 188 * (_byteord(aStr[0]) - 0xE0 + 31)
         else:
             return -1;
-        order = order + ord(aStr[1]) - 0x40
-        if aStr[1] > '\x7F':
-            order =- 1
+        order = order + _byteord(aStr[1]) - 0x40
+        if aStr[1] > _bytechar(0x7F):
+            order = -1
         return order
 
 class EUCJPDistributionAnalysis(CharDistributionAnalysis):
@@ -194,7 +194,7 @@ def get_order(self, aStr):
         #   first  byte range: 0xa0 -- 0xfe
         #   second byte range: 0xa1 -- 0xfe
         # no validation needed here. State machine has done that
-        if aStr[0] >= '\xA0':
-            return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
+        if aStr[0] >= _bytechar(0xA0):
+            return 94 * (_byteord(aStr[0]) - 0xA1) + _byteord(aStr[1]) - 0xa1
         else:
             return -1
diff --git a/chardet/charsetgroupprober.py b/chardet/charsetgroupprober.py
@@ -25,7 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants, sys
+import sys
+from chardet import constants
 from charsetprober import CharSetProber
 
 class CharSetGroupProber(CharSetProber):
@@ -41,7 +42,7 @@ def reset(self):
         for prober in self._mProbers:
             if prober:
                 prober.reset()
-                prober.active = constants.True
+                prober.active = True
                 self._mActiveNum += 1
         self._mBestGuessProber = None
 
@@ -62,7 +63,7 @@ def feed(self, aBuf):
                 self._mBestGuessProber = prober
                 return self.get_state()
             elif st == constants.eNotMe:
-                prober.active = constants.False
+                prober.active = False
                 self._mActiveNum -= 1
                 if self._mActiveNum <= 0:
                     self._mState = constants.eNotMe

diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py
@@ -26,7 +26,9 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants, re
+import re
+from chardet import constants
+from chardet.compat import _b
 
 class CharSetProber:
     def __init__(self):
@@ -48,11 +50,11 @@ def get_confidence(self):
         return 0.0
 
     def filter_high_bit_only(self, aBuf):
-        aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
+        aBuf = re.sub(_b(r'([\x00-\x7F])+'), _b(' '), aBuf)
         return aBuf
 
     def filter_without_english_letters(self, aBuf):
-        aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
+        aBuf = re.sub(_b(r'([A-Za-z])+'), _b(' '), aBuf)
         return aBuf
 
     def filter_with_english_letters(self, aBuf):

diff --git a/chardet/codingstatemachine.py b/chardet/codingstatemachine.py
@@ -25,7 +25,8 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from constants import eStart, eError, eItsMe
+from chardet.compat import _byteord
+from chardet.constants import eStart, eError
 
 class CodingStateMachine:
     def __init__(self, sm):
@@ -41,7 +42,7 @@ def next_state(self, c):
         # for each byte we get its class
         # if it is first byte, we also get byte length
         try:
-            byteCls = self._mModel['classTable'][ord(c)]
+            byteCls = self._mModel['classTable'][_byteord(c)]
         except IndexError:
             return eError
         if self._mCurrentState == eStart:

diff --git a/chardet/compat.py b/chardet/compat.py
@@ -0,0 +1,10 @@
+import sys
+
+if sys.version_info < (3, ):  # Python 2: indexing a str yields a 1-char str
+    _b = str
+    _bytechar = chr
+    _byteord = ord
+else:  # Python 3: indexing bytes yields an int, so no conversion is needed
+    _b = lambda _: _.encode('ascii')
+    _bytechar = int
+    _byteord = int
diff --git a/chardet/constants.py b/chardet/constants.py
@@ -37,11 +37,3 @@
 eItsMe = 2
 
 SHORTCUT_THRESHOLD = 0.95
-
-import __builtin__
-if not hasattr(__builtin__, 'False'):
-    False = 0
-    True = 1
-else:
-    False = __builtin__.False
-    True = __builtin__.True
diff --git a/chardet/escprober.py b/chardet/escprober.py
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants, sys
+from chardet import constants
 from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
 from charsetprober import CharSetProber
 from codingstatemachine import CodingStateMachine
@@ -45,7 +45,7 @@ def reset(self):
         CharSetProber.reset(self)
         for codingSM in self._mCodingSM:
             if not codingSM: continue
-            codingSM.active = constants.True
+            codingSM.active = True
             codingSM.reset()
         self._mActiveSM = len(self._mCodingSM)
         self._mDetectedCharset = None
@@ -66,7 +66,7 @@ def feed(self, aBuf):
                 if not codingSM.active: continue
                 codingState = codingSM.next_state(c)
                 if codingState == constants.eError:
-                    codingSM.active = constants.False
+                    codingSM.active = False
                     self._mActiveSM -= 1
                     if self._mActiveSM <= 0:
                         self._mState = constants.eNotMe

diff --git a/chardet/escsm.py b/chardet/escsm.py
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-from constants import eStart, eError, eItsMe
+from chardet.constants import eStart, eError, eItsMe
 
 HZ_cls = ( \
 1,0,0,0,0,0,0,0,  # 00 - 07 

diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py
@@ -25,8 +25,9 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants, sys
-from constants import eStart, eError, eItsMe
+import sys
+from chardet import constants
+from chardet.constants import eStart, eError, eItsMe
 from mbcharsetprober import MultiByteCharSetProber
 from codingstatemachine import CodingStateMachine
 from chardistribution import EUCJPDistributionAnalysis

diff --git a/chardet/hebrewprober.py b/chardet/hebrewprober.py
@@ -26,7 +26,7 @@
 ######################### END LICENSE BLOCK #########################
 
 from charsetprober import CharSetProber
-import constants
+from chardet import constants
 
 # This prober doesn't actually recognize a language or a charset.
 # It is a helper prober for the use of the Hebrew model probers

diff --git a/chardet/jpcntx.py b/chardet/jpcntx.py
@@ -25,7 +25,7 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants
+from chardet.compat import _bytechar, _byteord
 
 NUM_OF_CATEGORY = 6
 DONT_KNOW = -1
@@ -129,7 +129,7 @@ def reset(self):
         self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category
         self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
         self._mLastCharOrder = -1 # The order of previous char
-        self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
+        self._mDone = False # If this flag is set to True, detection is done and conclusion has been made
 
     def feed(self, aBuf, aLen):
         if self._mDone: return
@@ -151,7 +151,7 @@ def feed(self, aBuf, aLen):
                 if (order != -1) and (self._mLastCharOrder != -1):
                     self._mTotalRel += 1
                     if self._mTotalRel > MAX_REL_THRESHOLD:
-                        self._mDone = constants.True
+                        self._mDone = True
                         break
                     self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
                 self._mLastCharOrder = order
@@ -174,8 +174,8 @@ def get_order(self, aStr):
         if not aStr: return -1, 1
         # find out current char's byte length
         try:
-            if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
-               ((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
+            if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)) or \
+               (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xFC)):
                 charLen = 2
             else:
                 charLen = 1
@@ -184,10 +184,9 @@ def get_order(self, aStr):
 
         # return its order if it is hiragana
         if len(aStr) > 1:
-            if (aStr[0] == '\202') and \
-               (aStr[1] >= '\x9F') and \
-               (aStr[1] <= '\xF1'):
-                return ord(aStr[1]) - 0x9F, charLen
+            if (aStr[0] == _bytechar(0x82)) and \
+               (_bytechar(0x9F) <= aStr[1] <= _bytechar(0xF1)):  # old '\202' was octal, i.e. 0x82
+                return _byteord(aStr[1]) - 0x9F, charLen
 
         return -1, charLen
 
@@ -196,10 +195,10 @@ def get_order(self, aStr):
         if not aStr: return -1, 1
         # find out current char's byte length
         try:
-            if (aStr[0] == '\x8E') or \
-               ((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
+            if (aStr[0] == _bytechar(0x8E)) or \
+               (_bytechar(0xA1) <= aStr[0] <= _bytechar(0xFE)):
                 charLen = 2
-            elif aStr[0] == '\x8F':
+            elif aStr[0] == _bytechar(0x8F):
                 charLen = 3
             else:
                 charLen = 1
@@ -208,9 +207,8 @@ def get_order(self, aStr):
 
         # return its order if it is hiragana
         if len(aStr) > 1:
-            if (aStr[0] == '\xA4') and \
-               (aStr[1] >= '\xA1') and \
-               (aStr[1] <= '\xF3'):
-                return ord(aStr[1]) - 0xA1, charLen
+            if (aStr[0] == _bytechar(0xA4)) and \
+               (_bytechar(0xA1) <= aStr[1] <= _bytechar(0xF3)):
+                return _byteord(aStr[1]) - 0xA1, charLen
 
         return -1, charLen
diff --git a/chardet/langbulgarianmodel.py b/chardet/langbulgarianmodel.py
@@ -25,8 +25,6 @@
 # 02110-1301  USA
 ######################### END LICENSE BLOCK #########################
 
-import constants
-
 # 255: Control characters that usually does not exist in any text
 # 254: Carriage/Return
 # 253: symbol (punctuation) that does not belong to word
@@ -215,14 +213,14 @@
   'charToOrderMap': Latin5_BulgarianCharToOrderMap,
   'precedenceMatrix': BulgarianLangModel,
   'mTypicalPositiveRatio': 0.969392,
-  'keepEnglishLetter': constants.False,
+  'keepEnglishLetter': False,
   'charsetName': "ISO-8859-5"
 }
 
 Win1251BulgarianModel = { \
   'charToOrderMap': win1251BulgarianCharToOrderMap,
   'precedenceMatrix': BulgarianLangModel,
   'mTypicalPositiveRatio': 0.969392,
-  'keepEnglishLetter': constants.False,
+  'keepEnglishLetter': False,
   'charsetName': "windows-1251"
 }