diff --git a/chardet/chardistribution.py b/chardet/chardistribution.py index b893341..db3f985 100755 --- a/chardet/chardistribution.py +++ b/chardet/chardistribution.py @@ -25,7 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +from chardet.compat import _bytechar, _byteord from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO @@ -45,7 +45,7 @@ def __init__(self): def reset(self): """reset analyser, clear any state""" - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made + self._mDone = False # If this flag is set to True, detection is done and conclusion has been made self._mTotalChars = 0 # Total characters encountered self._mFreqChars = 0 # The number of characters whose frequency order is less than 512 @@ -100,8 +100,8 @@ def get_order(self, aStr): # first byte range: 0xc4 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xC4': - return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1 + if aStr[0] >= _bytechar(0xC4): + return 94 * (_byteord(aStr[0]) - 0xC4) + _byteord(aStr[1]) - 0xA1 else: return -1 @@ -117,8 +117,8 @@ def get_order(self, aStr): # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xB0': - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + if aStr[0] >= _bytechar(0xB0): + return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1 else: return -1; @@ -134,8 +134,8 @@ def get_order(self, aStr): # first byte range: 0xb0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'): - return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1 + if (aStr[0] >= _bytechar(0xB0)) and (aStr[1] >= _bytechar(0xA1)): + return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1 else: return -1; @@ -151,11 +151,11 @@ def get_order(self, aStr): # first byte range: 0xa4 -- 0xfe # second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe # no validation needed here. State machine has done that - if aStr[0] >= '\xA4': - if aStr[1] >= '\xA1': - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63 + if aStr[0] >= _bytechar(0xA4): + if aStr[1] >= _bytechar(0xA1): + return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0xA1 + 63 else: - return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40 + return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0x40 else: return -1 @@ -171,15 +171,15 @@ def get_order(self, aStr): # first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe # second byte range: 0x40 -- 0x7e, 0x81 -- oxfe # no validation needed here. State machine has done that - if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'): - order = 188 * (ord(aStr[0]) - 0x81) - elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'): - order = 188 * (ord(aStr[0]) - 0xE0 + 31) + if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)): + order = 188 * (_byteord(aStr[0]) - 0x81) + elif (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xEF)): + order = 188 * (_byteord(aStr[0]) - 0xE0 + 31) else: return -1; - order = order + ord(aStr[1]) - 0x40 - if aStr[1] > '\x7F': - order =- 1 + order = order + _byteord(aStr[1]) - 0x40 + if aStr[1] > _bytechar(0x7F): + order = -1 return order class EUCJPDistributionAnalysis(CharDistributionAnalysis): @@ -194,7 +194,7 @@ def get_order(self, aStr): # first byte range: 0xa0 -- 0xfe # second byte range: 0xa1 -- 0xfe # no validation needed here. 
State machine has done that - if aStr[0] >= '\xA0': - return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1 + if aStr[0] >= _bytechar(0xA0): + return 94 * (_byteord(aStr[0]) - 0xA1) + _byteord(aStr[1]) - 0xa1 else: return -1 diff --git a/chardet/charsetgroupprober.py b/chardet/charsetgroupprober.py index 5188069..fd05c2b 100755 --- a/chardet/charsetgroupprober.py +++ b/chardet/charsetgroupprober.py @@ -25,7 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys +import sys +from chardet import constants from charsetprober import CharSetProber class CharSetGroupProber(CharSetProber): @@ -41,7 +42,7 @@ def reset(self): for prober in self._mProbers: if prober: prober.reset() - prober.active = constants.True + prober.active = True self._mActiveNum += 1 self._mBestGuessProber = None @@ -62,7 +63,7 @@ def feed(self, aBuf): self._mBestGuessProber = prober return self.get_state() elif st == constants.eNotMe: - prober.active = constants.False + prober.active = False self._mActiveNum -= 1 if self._mActiveNum <= 0: self._mState = constants.eNotMe diff --git a/chardet/charsetprober.py b/chardet/charsetprober.py index 3ac1683..efa2661 100755 --- a/chardet/charsetprober.py +++ b/chardet/charsetprober.py @@ -26,7 +26,9 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, re +import re +from chardet import constants +from chardet.compat import _b class CharSetProber: def __init__(self): @@ -48,11 +50,11 @@ def get_confidence(self): return 0.0 def filter_high_bit_only(self, aBuf): - aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf) + aBuf = re.sub(_b(r'([\x00-\x7F])+'), _b(' '), aBuf) return aBuf def filter_without_english_letters(self, aBuf): - aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf) + aBuf = re.sub(_b(r'([A-Za-z])+'), _b(' '), aBuf) return aBuf def filter_with_english_letters(self, aBuf): diff --git a/chardet/codingstatemachine.py b/chardet/codingstatemachine.py 
index 92d595f..eefe3ab 100755 --- a/chardet/codingstatemachine.py +++ b/chardet/codingstatemachine.py @@ -25,7 +25,8 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from chardet.compat import _byteord +from chardet.constants import eStart, eError class CodingStateMachine: def __init__(self, sm): @@ -41,7 +42,7 @@ def next_state(self, c): # for each byte we get its class # if it is first byte, we also get byte length try: - byteCls = self._mModel['classTable'][ord(c)] + byteCls = self._mModel['classTable'][_byteord(c)] except IndexError: return eError if self._mCurrentState == eStart: diff --git a/chardet/compat.py b/chardet/compat.py new file mode 100644 index 0000000..86e76bd --- /dev/null +++ b/chardet/compat.py @@ -0,0 +1,10 @@ +import sys + +if sys.version_info >= (3, ): + _b = lambda _: _.encode('ascii') + _bytechar = int + _byteord = int +else: + _b = str + _bytechar = chr + _byteord = ord diff --git a/chardet/constants.py b/chardet/constants.py index e94e226..e4d148b 100755 --- a/chardet/constants.py +++ b/chardet/constants.py @@ -37,11 +37,3 @@ eItsMe = 2 SHORTCUT_THRESHOLD = 0.95 - -import __builtin__ -if not hasattr(__builtin__, 'False'): - False = 0 - True = 1 -else: - False = __builtin__.False - True = __builtin__.True diff --git a/chardet/escprober.py b/chardet/escprober.py index 572ed7b..13406d2 100755 --- a/chardet/escprober.py +++ b/chardet/escprober.py @@ -25,7 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys +from chardet import constants from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel from charsetprober import CharSetProber from codingstatemachine import CodingStateMachine @@ -45,7 +45,7 @@ def reset(self): CharSetProber.reset(self) for codingSM in self._mCodingSM: if not codingSM: continue - codingSM.active = constants.True + codingSM.active = True 
codingSM.reset() self._mActiveSM = len(self._mCodingSM) self._mDetectedCharset = None @@ -66,7 +66,7 @@ def feed(self, aBuf): if not codingSM.active: continue codingState = codingSM.next_state(c) if codingState == constants.eError: - codingSM.active = constants.False + codingSM.active = False self._mActiveSM -= 1 if self._mActiveSM <= 0: self._mState = constants.eNotMe diff --git a/chardet/escsm.py b/chardet/escsm.py index 9fa2295..9c98e8e 100755 --- a/chardet/escsm.py +++ b/chardet/escsm.py @@ -25,7 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from chardet.constants import eStart, eError, eItsMe HZ_cls = ( \ 1,0,0,0,0,0,0,0, # 00 - 07 diff --git a/chardet/eucjpprober.py b/chardet/eucjpprober.py index 46a8b38..4dea047 100755 --- a/chardet/eucjpprober.py +++ b/chardet/eucjpprober.py @@ -25,8 +25,9 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe +import sys +from chardet import constants +from chardet.constants import eStart, eError, eItsMe from mbcharsetprober import MultiByteCharSetProber from codingstatemachine import CodingStateMachine from chardistribution import EUCJPDistributionAnalysis diff --git a/chardet/hebrewprober.py b/chardet/hebrewprober.py index a2b1eaa..e3f4e87 100755 --- a/chardet/hebrewprober.py +++ b/chardet/hebrewprober.py @@ -26,7 +26,7 @@ ######################### END LICENSE BLOCK ######################### from charsetprober import CharSetProber -import constants +from chardet import constants # This prober doesn't actually recognize a language or a charset. 
# It is a helper prober for the use of the Hebrew model probers diff --git a/chardet/jpcntx.py b/chardet/jpcntx.py index b02a34d..1ec81a8 100755 --- a/chardet/jpcntx.py +++ b/chardet/jpcntx.py @@ -25,7 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants +from chardet.compat import _bytechar, _byteord NUM_OF_CATEGORY = 6 DONT_KNOW = -1 @@ -129,7 +129,7 @@ def reset(self): self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each interger counts sequence in its category self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer self._mLastCharOrder = -1 # The order of previous char - self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made + self._mDone = False # If this flag is set to True, detection is done and conclusion has been made def feed(self, aBuf, aLen): if self._mDone: return @@ -151,7 +151,7 @@ def feed(self, aBuf, aLen): if (order != -1) and (self._mLastCharOrder != -1): self._mTotalRel += 1 if self._mTotalRel > MAX_REL_THRESHOLD: - self._mDone = constants.True + self._mDone = True break self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1 self._mLastCharOrder = order @@ -174,8 +174,8 @@ def get_order(self, aStr): if not aStr: return -1, 1 # find out current char's byte length try: - if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \ - ((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')): + if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)) or \ + (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xFC)): charLen = 2 else: charLen = 1 @@ -184,10 +184,9 @@ def get_order(self, aStr): # return its order if it is hiragana if len(aStr) > 1: - if (aStr[0] == '\202') and \ - (aStr[1] >= '\x9F') and \ - (aStr[1] <= '\xF1'): - return ord(aStr[1]) - 0x9F, charLen + if (aStr[0] == _bytechar(0x82)) and \ + (_bytechar(0x9F) <= aStr[1] <= 
_bytechar(0xF1)): + return _byteord(aStr[1]) - 0x9F, charLen return -1, charLen @@ -196,10 +195,10 @@ def get_order(self, aStr): if not aStr: return -1, 1 # find out current char's byte length try: - if (aStr[0] == '\x8E') or \ - ((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')): + if (aStr[0] == _bytechar(0x8E)) or \ + (_bytechar(0xA1) <= aStr[0] <= _bytechar(0xFE)): charLen = 2 - elif aStr[0] == '\x8F': + elif aStr[0] == _bytechar(0x8F): charLen = 3 else: charLen = 1 @@ -208,9 +207,8 @@ def get_order(self, aStr): # return its order if it is hiragana if len(aStr) > 1: - if (aStr[0] == '\xA4') and \ - (aStr[1] >= '\xA1') and \ - (aStr[1] <= '\xF3'): - return ord(aStr[1]) - 0xA1, charLen + if (aStr[0] == _bytechar(0xA4)) and \ + (_bytechar(0xA1) <= aStr[1] <= _bytechar(0xF3)): + return _byteord(aStr[1]) - 0xA1, charLen return -1, charLen diff --git a/chardet/langbulgarianmodel.py b/chardet/langbulgarianmodel.py index bf5641e..485e873 100755 --- a/chardet/langbulgarianmodel.py +++ b/chardet/langbulgarianmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -215,7 +213,7 @@ 'charToOrderMap': Latin5_BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } @@ -223,6 +221,6 @@ 'charToOrderMap': win1251BulgarianCharToOrderMap, 'precedenceMatrix': BulgarianLangModel, 'mTypicalPositiveRatio': 0.969392, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } diff --git a/chardet/langcyrillicmodel.py b/chardet/langcyrillicmodel.py index e604cc7..17fdbf7 100755 --- a/chardet/langcyrillicmodel.py +++ b/chardet/langcyrillicmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA 
######################### END LICENSE BLOCK ######################### -import constants - # KOI8-R language model # Character Mapping Table: KOI8R_CharToOrderMap = ( \ @@ -284,7 +282,7 @@ 'charToOrderMap': KOI8R_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "KOI8-R" } @@ -292,7 +290,7 @@ 'charToOrderMap': win1251_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1251" } @@ -300,7 +298,7 @@ 'charToOrderMap': latin5_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-5" } @@ -308,7 +306,7 @@ 'charToOrderMap': macCyrillic_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "MacCyrillic" }; @@ -316,7 +314,7 @@ 'charToOrderMap': IBM866_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM866" } @@ -324,6 +322,6 @@ 'charToOrderMap': IBM855_CharToOrderMap, 'precedenceMatrix': RussianLangModel, 'mTypicalPositiveRatio': 0.976601, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "IBM855" } diff --git a/chardet/langgreekmodel.py b/chardet/langgreekmodel.py index ec6d49e..6e2104d 100755 --- a/chardet/langgreekmodel.py +++ b/chardet/langgreekmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to 
word @@ -212,7 +210,7 @@ 'charToOrderMap': Latin7_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "ISO-8859-7" } @@ -220,6 +218,6 @@ 'charToOrderMap': win1253_CharToOrderMap, 'precedenceMatrix': GreekLangModel, 'mTypicalPositiveRatio': 0.982851, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1253" } diff --git a/chardet/langhebrewmodel.py b/chardet/langhebrewmodel.py index a8bcc65..cc74018 100755 --- a/chardet/langhebrewmodel.py +++ b/chardet/langhebrewmodel.py @@ -27,8 +27,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -196,6 +194,6 @@ 'charToOrderMap': win1255_CharToOrderMap, 'precedenceMatrix': HebrewLangModel, 'mTypicalPositiveRatio': 0.984004, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "windows-1255" } diff --git a/chardet/langhungarianmodel.py b/chardet/langhungarianmodel.py index d635f03..9cae594 100755 --- a/chardet/langhungarianmodel.py +++ b/chardet/langhungarianmodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -212,7 +210,7 @@ 'charToOrderMap': Latin2_HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "ISO-8859-2" } @@ -220,6 +218,6 @@ 'charToOrderMap': win1250HungarianCharToOrderMap, 'precedenceMatrix': HungarianLangModel, 'mTypicalPositiveRatio': 0.947368, - 
'keepEnglishLetter': constants.True, + 'keepEnglishLetter': True, 'charsetName': "windows-1250" } diff --git a/chardet/langthaimodel.py b/chardet/langthaimodel.py index 96ec054..543a361 100755 --- a/chardet/langthaimodel.py +++ b/chardet/langthaimodel.py @@ -25,8 +25,6 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants - # 255: Control characters that usually does not exist in any text # 254: Carriage/Return # 253: symbol (punctuation) that does not belong to word @@ -195,6 +193,6 @@ 'charToOrderMap': TIS620CharToOrderMap, 'precedenceMatrix': ThaiLangModel, 'mTypicalPositiveRatio': 0.926386, - 'keepEnglishLetter': constants.False, + 'keepEnglishLetter': False, 'charsetName': "TIS-620" } diff --git a/chardet/latin1prober.py b/chardet/latin1prober.py index abaaf98..073f894 100755 --- a/chardet/latin1prober.py +++ b/chardet/latin1prober.py @@ -27,7 +27,8 @@ ######################### END LICENSE BLOCK ######################### from charsetprober import CharSetProber -import constants +from chardet import constants +from chardet.compat import _byteord import operator FREQ_CAT_NUM = 4 @@ -110,7 +111,7 @@ def feed(self, aBuf): aBuf = self.filter_with_english_letters(aBuf) for c in aBuf: try: - charClass = Latin1_CharToClass[ord(c)] + charClass = Latin1_CharToClass[_byteord(c)] except IndexError: return constants.eError freq = Latin1ClassModel[(self._mLastCharClass * CLASS_NUM) + charClass] diff --git a/chardet/mbcharsetprober.py b/chardet/mbcharsetprober.py index a813144..ec3c8f3 100755 --- a/chardet/mbcharsetprober.py +++ b/chardet/mbcharsetprober.py @@ -27,7 +27,9 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys +import sys +from chardet import constants +from chardet.compat import _bytechar from constants import eStart, eError, eItsMe from charsetprober import CharSetProber @@ -36,7 +38,7 @@ def __init__(self): CharSetProber.__init__(self) 
self._mDistributionAnalyzer = None self._mCodingSM = None - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [_bytechar(0), _bytechar(0)] def reset(self): CharSetProber.reset(self) @@ -44,7 +46,7 @@ def reset(self): self._mCodingSM.reset() if self._mDistributionAnalyzer: self._mDistributionAnalyzer.reset() - self._mLastChar = ['\x00', '\x00'] + self._mLastChar = [_bytechar(0), _bytechar(0)] def get_charset_name(self): pass diff --git a/chardet/mbcssm.py b/chardet/mbcssm.py index e46c1ff..0f0497d 100755 --- a/chardet/mbcssm.py +++ b/chardet/mbcssm.py @@ -25,7 +25,7 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -from constants import eStart, eError, eItsMe +from chardet.constants import eStart, eError, eItsMe # BIG5 diff --git a/chardet/sbcharsetprober.py b/chardet/sbcharsetprober.py index 6a585f7..78fce72 100755 --- a/chardet/sbcharsetprober.py +++ b/chardet/sbcharsetprober.py @@ -26,7 +26,9 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys +import sys +from chardet import constants +from chardet.compat import _byteord from charsetprober import CharSetProber SAMPLE_SIZE = 64 @@ -39,7 +41,7 @@ #NEGATIVE_CAT = 0 class SingleByteCharSetProber(CharSetProber): - def __init__(self, model, reversed=constants.False, nameProber=None): + def __init__(self, model, reversed=False, nameProber=None): CharSetProber.__init__(self) self._mModel = model self._mReversed = reversed # TRUE if we need to reverse every pair in the model lookup @@ -68,7 +70,7 @@ def feed(self, aBuf): return self.get_state() for c in aBuf: try: - order = self._mModel['charToOrderMap'][ord(c)] + order = self._mModel['charToOrderMap'][_byteord(c)] except IndexError: return constants.eError if order < SYMBOL_CAT_ORDER: diff --git a/chardet/sbcsgroupprober.py b/chardet/sbcsgroupprober.py index d19160c..f8c13de 100755 --- a/chardet/sbcsgroupprober.py +++ b/chardet/sbcsgroupprober.py @@ -26,7 +26,6 @@ 
# 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys from charsetgroupprober import CharSetGroupProber from sbcharsetprober import SingleByteCharSetProber from langcyrillicmodel import Win1251CyrillicModel, Koi8rModel, Latin5CyrillicModel, MacCyrillicModel, Ibm866Model, Ibm855Model @@ -56,8 +55,8 @@ def __init__(self): SingleByteCharSetProber(TIS620ThaiModel), ] hebrewProber = HebrewProber() - logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.False, hebrewProber) - visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, constants.True, hebrewProber) + logicalHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, False, hebrewProber) + visualHebrewProber = SingleByteCharSetProber(Win1255HebrewModel, True, hebrewProber) hebrewProber.set_model_probers(logicalHebrewProber, visualHebrewProber) self._mProbers.extend([hebrewProber, logicalHebrewProber, visualHebrewProber]) diff --git a/chardet/sjisprober.py b/chardet/sjisprober.py index 2e7a234..2316576 100755 --- a/chardet/sjisprober.py +++ b/chardet/sjisprober.py @@ -30,8 +30,9 @@ from chardistribution import SJISDistributionAnalysis from jpcntx import SJISContextAnalysis from mbcssm import SJISSMModel -import constants, sys -from constants import eStart, eError, eItsMe +import sys +from chardet import constants +from chardet.constants import eStart, eError, eItsMe class SJISProber(MultiByteCharSetProber): def __init__(self): diff --git a/chardet/universaldetector.py b/chardet/universaldetector.py index c822aa7..e451e64 100755 --- a/chardet/universaldetector.py +++ b/chardet/universaldetector.py @@ -26,7 +26,9 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys +import sys +from chardet import constants +from chardet.compat import _b from latin1prober import Latin1Prober # windows-1252 from mbcsgroupprober import MBCSGroupProber # multi-byte character sets from 
sbcsgroupprober import SBCSGroupProber # single-byte character sets @@ -43,19 +45,19 @@ class UniversalDetector: def __init__(self): - self._highBitDetector = re.compile(r'[\x80-\xFF]') - self._escDetector = re.compile(r'(\033|~{)') + self._highBitDetector = re.compile(_b(r'[\x80-\xFF]')) + self._escDetector = re.compile(_b(r'(\033|~{)')) self._mEscCharSetProber = None self._mCharSetProbers = [] self.reset() def reset(self): self.result = {'encoding': None, 'confidence': 0.0} - self.done = constants.False - self._mStart = constants.True - self._mGotData = constants.False + self.done = False + self._mStart = True + self._mGotData = False self._mInputState = ePureAscii - self._mLastChar = '' + self._mLastChar = _b('') if self._mEscCharSetProber: self._mEscCharSetProber.reset() for prober in self._mCharSetProbers: @@ -91,9 +93,9 @@ def feed(self, aBuf): self.result = result break - self._mGotData = constants.True + self._mGotData = True if self.result['encoding'] and (self.result['confidence'] > 0.0): - self.done = constants.True + self.done = True return if self._mInputState == ePureAscii: @@ -102,7 +104,7 @@ def feed(self, aBuf): elif (self._mInputState == ePureAscii) and self._escDetector.search(self._mLastChar + aBuf): self._mInputState = eEscAscii - self._mLastChar = aBuf[-1] + self._mLastChar = aBuf[-1:] if self._mInputState == eEscAscii: if not self._mEscCharSetProber: @@ -110,7 +112,7 @@ def feed(self, aBuf): if self._mEscCharSetProber.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': self._mEscCharSetProber.get_charset_name(), 'confidence': self._mEscCharSetProber.get_confidence()} - self.done = constants.True + self.done = True elif self._mInputState == eHighbyte: if not self._mCharSetProbers: self._mCharSetProbers = [MBCSGroupProber(), SBCSGroupProber(), Latin1Prober()] @@ -119,7 +121,7 @@ def feed(self, aBuf): if prober.feed(aBuf) == constants.eFoundIt: self.result = {'encoding': prober.get_charset_name(), 'confidence': prober.get_confidence()} 
- self.done = constants.True + self.done = True break except (UnicodeDecodeError, UnicodeEncodeError), e: logger.exception(e) @@ -130,7 +132,7 @@ def close(self): if constants._debug: sys.stderr.write('no data received!\n') return - self.done = constants.True + self.done = True if self._mInputState == ePureAscii: self.result = {'encoding': 'ascii', 'confidence': 1.0} diff --git a/chardet/utf8prober.py b/chardet/utf8prober.py index c1792bb..f02c2a6 100755 --- a/chardet/utf8prober.py +++ b/chardet/utf8prober.py @@ -25,11 +25,11 @@ # 02110-1301 USA ######################### END LICENSE BLOCK ######################### -import constants, sys -from constants import eStart, eError, eItsMe -from charsetprober import CharSetProber -from codingstatemachine import CodingStateMachine -from mbcssm import UTF8SMModel +from chardet import constants +from chardet.constants import eStart, eError, eItsMe +from chardet.charsetprober import CharSetProber +from chardet.codingstatemachine import CodingStateMachine +from chardet.mbcssm import UTF8SMModel ONE_CHAR_PROB = 0.5 diff --git a/setup.py b/setup.py index 724c661..c08c0b1 100755 --- a/setup.py +++ b/setup.py @@ -1,3 +1,4 @@ +import sys from setuptools import setup # patch distutils if it can't cope with the "classifiers" or "download_url" @@ -8,6 +9,9 @@ if not hasattr(DistributionMetadata, 'download_url'): DistributionMetadata.download_url = None +kwargs = {} +if sys.version_info >= (3, ): + kwargs['use_2to3'] = True setup( name = 'chardet', version = '1.1', @@ -29,7 +33,7 @@ - ISO-8859-8, windows-1255 (Visual and Logical Hebrew) - TIS-620 (Thai) -Requires Python 2.1 or later +Requires Python 2.3 or later Command-line Tool ----------------- @@ -56,9 +60,21 @@ "License :: OSI Approved :: GNU Library or Lesser General Public License (LGPL)", "Operating System :: OS Independent", "Programming Language :: Python", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.3", + "Programming Language :: Python :: 
2.4", + "Programming Language :: Python :: 2.5", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.1", + "Programming Language :: Python :: 3.2", + 'Programming Language :: Python :: Implementation :: CPython', + 'Programming Language :: Python :: Implementation :: PyPy', "Topic :: Software Development :: Libraries :: Python Modules", "Topic :: Text Processing :: Linguistic", ], scripts=['bin/chardetect.py'], - packages = ['chardet'] + packages = ['chardet'], + **kwargs )