Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 21 additions & 21 deletions chardet/chardistribution.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants
from chardet.compat import _bytechar, _byteord
from euctwfreq import EUCTWCharToFreqOrder, EUCTW_TABLE_SIZE, EUCTW_TYPICAL_DISTRIBUTION_RATIO
from euckrfreq import EUCKRCharToFreqOrder, EUCKR_TABLE_SIZE, EUCKR_TYPICAL_DISTRIBUTION_RATIO
from gb2312freq import GB2312CharToFreqOrder, GB2312_TABLE_SIZE, GB2312_TYPICAL_DISTRIBUTION_RATIO
Expand All @@ -45,7 +45,7 @@ def __init__(self):

def reset(self):
"""reset analyser, clear any state"""
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
self._mDone = False # If this flag is set to True, detection is done and conclusion has been made
self._mTotalChars = 0 # Total characters encountered
self._mFreqChars = 0 # The number of characters whose frequency order is less than 512

Expand Down Expand Up @@ -100,8 +100,8 @@ def get_order(self, aStr):
# first byte range: 0xc4 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xC4':
return 94 * (ord(aStr[0]) - 0xC4) + ord(aStr[1]) - 0xA1
if aStr[0] >= _bytechar(0xC4):
return 94 * (_byteord(aStr[0]) - 0xC4) + _byteord(aStr[1]) - 0xA1
else:
return -1

Expand All @@ -117,8 +117,8 @@ def get_order(self, aStr):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xB0':
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
if aStr[0] >= _bytechar(0xB0):
return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1
else:
return -1;

Expand All @@ -134,8 +134,8 @@ def get_order(self, aStr):
# first byte range: 0xb0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\xB0') and (aStr[1] >= '\xA1'):
return 94 * (ord(aStr[0]) - 0xB0) + ord(aStr[1]) - 0xA1
if (aStr[0] >= _bytechar(0xB0)) and (aStr[1] >= _bytechar(0xA1)):
return 94 * (_byteord(aStr[0]) - 0xB0) + _byteord(aStr[1]) - 0xA1
else:
return -1;

Expand All @@ -151,11 +151,11 @@ def get_order(self, aStr):
# first byte range: 0xa4 -- 0xfe
# second byte range: 0x40 -- 0x7e , 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA4':
if aStr[1] >= '\xA1':
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0xA1 + 63
if aStr[0] >= _bytechar(0xA4):
if aStr[1] >= _bytechar(0xA1):
return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0xA1 + 63
else:
return 157 * (ord(aStr[0]) - 0xA4) + ord(aStr[1]) - 0x40
return 157 * (_byteord(aStr[0]) - 0xA4) + _byteord(aStr[1]) - 0x40
else:
return -1

Expand All @@ -171,15 +171,15 @@ def get_order(self, aStr):
# first byte range: 0x81 -- 0x9f , 0xe0 -- 0xfe
# second byte range: 0x40 -- 0x7e, 0x81 -- 0xfe
# no validation needed here. State machine has done that
if (aStr[0] >= '\x81') and (aStr[0] <= '\x9F'):
order = 188 * (ord(aStr[0]) - 0x81)
elif (aStr[0] >= '\xE0') and (aStr[0] <= '\xEF'):
order = 188 * (ord(aStr[0]) - 0xE0 + 31)
if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)):
order = 188 * (_byteord(aStr[0]) - 0x81)
elif (_bytechar(0xE0) <= aStr[0] <= _bytechar(0xEF)):
order = 188 * (_byteord(aStr[0]) - 0xE0 + 31)
else:
return -1;
order = order + ord(aStr[1]) - 0x40
if aStr[1] > '\x7F':
order =- 1
order = order + _byteord(aStr[1]) - 0x40
if aStr[1] > _bytechar(0x7F):
order = -1
return order

class EUCJPDistributionAnalysis(CharDistributionAnalysis):
Expand All @@ -194,7 +194,7 @@ def get_order(self, aStr):
# first byte range: 0xa0 -- 0xfe
# second byte range: 0xa1 -- 0xfe
# no validation needed here. State machine has done that
if aStr[0] >= '\xA0':
return 94 * (ord(aStr[0]) - 0xA1) + ord(aStr[1]) - 0xa1
if aStr[0] >= _bytechar(0xA0):
return 94 * (_byteord(aStr[0]) - 0xA1) + _byteord(aStr[1]) - 0xa1
else:
return -1
7 changes: 4 additions & 3 deletions chardet/charsetgroupprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants, sys
import sys
from chardet import constants
from charsetprober import CharSetProber

class CharSetGroupProber(CharSetProber):
Expand All @@ -41,7 +42,7 @@ def reset(self):
for prober in self._mProbers:
if prober:
prober.reset()
prober.active = constants.True
prober.active = True
self._mActiveNum += 1
self._mBestGuessProber = None

Expand All @@ -62,7 +63,7 @@ def feed(self, aBuf):
self._mBestGuessProber = prober
return self.get_state()
elif st == constants.eNotMe:
prober.active = constants.False
prober.active = False
self._mActiveNum -= 1
if self._mActiveNum <= 0:
self._mState = constants.eNotMe
Expand Down
8 changes: 5 additions & 3 deletions chardet/charsetprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants, re
import re
from chardet import constants
from chardet.compat import _b

class CharSetProber:
def __init__(self):
Expand All @@ -48,11 +50,11 @@ def get_confidence(self):
return 0.0

def filter_high_bit_only(self, aBuf):
aBuf = re.sub(r'([\x00-\x7F])+', ' ', aBuf)
aBuf = re.sub(_b(r'([\x00-\x7F])+'), _b(' '), aBuf)
return aBuf

def filter_without_english_letters(self, aBuf):
aBuf = re.sub(r'([A-Za-z])+', ' ', aBuf)
aBuf = re.sub(_b(r'([A-Za-z])+'), _b(' '), aBuf)
return aBuf

def filter_with_english_letters(self, aBuf):
Expand Down
5 changes: 3 additions & 2 deletions chardet/codingstatemachine.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,8 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from constants import eStart, eError, eItsMe
from chardet.compat import _byteord
from chardet.constants import eStart, eError

class CodingStateMachine:
def __init__(self, sm):
Expand All @@ -41,7 +42,7 @@ def next_state(self, c):
# for each byte we get its class
# if it is first byte, we also get byte length
try:
byteCls = self._mModel['classTable'][ord(c)]
byteCls = self._mModel['classTable'][_byteord(c)]
except IndexError:
return eError
if self._mCurrentState == eStart:
Expand Down
10 changes: 10 additions & 0 deletions chardet/compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Python 2/3 compatibility helpers for byte-level comparisons.
#
# Callers compare and convert single bytes taken from a buffer by index
# (e.g. ``aStr[0] >= _bytechar(0xC4)`` and ``_byteord(aStr[0])``); these
# helpers hide the difference in what that indexing yields.
import sys

if sys.version_info >= (3, ):
    # Python 3: indexing a bytes object yields an int, so the byte helpers
    # only pass integers through, and text literals must be encoded to
    # obtain a bytes value.
    def _b(text):
        """Return the ASCII-encoded bytes form of the text literal *text*."""
        return text.encode('ascii')

    _bytechar = int
    _byteord = int
else:
    # Python 2: str is already a byte string and indexing it yields a
    # one-character str, so the builtins do the conversions directly.
    _b = str
    _bytechar = chr
    _byteord = ord
8 changes: 0 additions & 8 deletions chardet/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,11 +37,3 @@
eItsMe = 2

SHORTCUT_THRESHOLD = 0.95

import __builtin__
if not hasattr(__builtin__, 'False'):
False = 0
True = 1
else:
False = __builtin__.False
True = __builtin__.True
6 changes: 3 additions & 3 deletions chardet/escprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants, sys
from chardet import constants
from escsm import HZSMModel, ISO2022CNSMModel, ISO2022JPSMModel, ISO2022KRSMModel
from charsetprober import CharSetProber
from codingstatemachine import CodingStateMachine
Expand All @@ -45,7 +45,7 @@ def reset(self):
CharSetProber.reset(self)
for codingSM in self._mCodingSM:
if not codingSM: continue
codingSM.active = constants.True
codingSM.active = True
codingSM.reset()
self._mActiveSM = len(self._mCodingSM)
self._mDetectedCharset = None
Expand All @@ -66,7 +66,7 @@ def feed(self, aBuf):
if not codingSM.active: continue
codingState = codingSM.next_state(c)
if codingState == constants.eError:
codingSM.active = constants.False
codingSM.active = False
self._mActiveSM -= 1
if self._mActiveSM <= 0:
self._mState = constants.eNotMe
Expand Down
2 changes: 1 addition & 1 deletion chardet/escsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

from constants import eStart, eError, eItsMe
from chardet.constants import eStart, eError, eItsMe

HZ_cls = ( \
1,0,0,0,0,0,0,0, # 00 - 07
Expand Down
5 changes: 3 additions & 2 deletions chardet/eucjpprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,9 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants, sys
from constants import eStart, eError, eItsMe
import sys
from chardet import constants
from chardet.constants import eStart, eError, eItsMe
from mbcharsetprober import MultiByteCharSetProber
from codingstatemachine import CodingStateMachine
from chardistribution import EUCJPDistributionAnalysis
Expand Down
2 changes: 1 addition & 1 deletion chardet/hebrewprober.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
######################### END LICENSE BLOCK #########################

from charsetprober import CharSetProber
import constants
from chardet import constants

# This prober doesn't actually recognize a language or a charset.
# It is a helper prober for the use of the Hebrew model probers
Expand Down
30 changes: 14 additions & 16 deletions chardet/jpcntx.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants
from chardet.compat import _bytechar, _byteord

NUM_OF_CATEGORY = 6
DONT_KNOW = -1
Expand Down Expand Up @@ -129,7 +129,7 @@ def reset(self):
self._mRelSample = [0] * NUM_OF_CATEGORY # category counters, each integer counts the sequences in its category
self._mNeedToSkipCharNum = 0 # if last byte in current buffer is not the last byte of a character, we need to know how many bytes to skip in next buffer
self._mLastCharOrder = -1 # The order of previous char
self._mDone = constants.False # If this flag is set to constants.True, detection is done and conclusion has been made
self._mDone = False # If this flag is set to True, detection is done and conclusion has been made

def feed(self, aBuf, aLen):
if self._mDone: return
Expand All @@ -151,7 +151,7 @@ def feed(self, aBuf, aLen):
if (order != -1) and (self._mLastCharOrder != -1):
self._mTotalRel += 1
if self._mTotalRel > MAX_REL_THRESHOLD:
self._mDone = constants.True
self._mDone = True
break
self._mRelSample[jp2CharContext[self._mLastCharOrder][order]] += 1
self._mLastCharOrder = order
Expand All @@ -174,8 +174,8 @@ def get_order(self, aStr):
if not aStr: return -1, 1
# find out current char's byte length
try:
if ((aStr[0] >= '\x81') and (aStr[0] <= '\x9F')) or \
((aStr[0] >= '\xE0') and (aStr[0] <= '\xFC')):
if (_bytechar(0x81) <= aStr[0] <= _bytechar(0x9F)) or \
(_bytechar(0xE0) <= aStr[0] <= _bytechar(0xFC)):
charLen = 2
else:
charLen = 1
Expand All @@ -184,10 +184,9 @@ def get_order(self, aStr):

# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\202') and \
(aStr[1] >= '\x9F') and \
(aStr[1] <= '\xF1'):
return ord(aStr[1]) - 0x9F, charLen
if (aStr[0] == _bytechar(202)) and \
(_bytechar(0x9F) <= aStr[1] <= _bytechar(0xF1)):
return _byteord(aStr[1]) - 0x9F, charLen

return -1, charLen

Expand All @@ -196,10 +195,10 @@ def get_order(self, aStr):
if not aStr: return -1, 1
# find out current char's byte length
try:
if (aStr[0] == '\x8E') or \
((aStr[0] >= '\xA1') and (aStr[0] <= '\xFE')):
if (aStr[0] == _bytechar(0x8E)) or \
(_bytechar(0xA1) <= aStr[0] <= _bytechar(0xFE)):
charLen = 2
elif aStr[0] == '\x8F':
elif aStr[0] == _bytechar(0x8F):
charLen = 3
else:
charLen = 1
Expand All @@ -208,9 +207,8 @@ def get_order(self, aStr):

# return its order if it is hiragana
if len(aStr) > 1:
if (aStr[0] == '\xA4') and \
(aStr[1] >= '\xA1') and \
(aStr[1] <= '\xF3'):
return ord(aStr[1]) - 0xA1, charLen
if (aStr[0] == _bytechar(0xA4)) and \
(_bytechar(0xA1) <= aStr[1] <= _bytechar(0xF3)):
return _byteord(aStr[1]) - 0xA1, charLen

return -1, charLen
6 changes: 2 additions & 4 deletions chardet/langbulgarianmodel.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,6 @@
# 02110-1301 USA
######################### END LICENSE BLOCK #########################

import constants

# 255: Control characters that usually do not exist in any text
# 254: Carriage/Return
# 253: symbol (punctuation) that does not belong to a word
Expand Down Expand Up @@ -215,14 +213,14 @@
'charToOrderMap': Latin5_BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "ISO-8859-5"
}

Win1251BulgarianModel = { \
'charToOrderMap': win1251BulgarianCharToOrderMap,
'precedenceMatrix': BulgarianLangModel,
'mTypicalPositiveRatio': 0.969392,
'keepEnglishLetter': constants.False,
'keepEnglishLetter': False,
'charsetName': "windows-1251"
}
Loading