chardet/convert_language_model.py at d83887c020da4b0f0f03a0aa70e21e520d9a0c96 · chardet/chardet

History

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

#!/usr/bin/env python

######################## BEGIN LICENSE BLOCK ########################

# Contributor(s):

# 10.02.2015 - helour - first attempt

# 08.08.2016 - Dan Blanchard - first usable release

#

# This library is free software; you can redistribute it and/or

# modify it under the terms of the GNU Lesser General Public

# License as published by the Free Software Foundation; either

# version 2.1 of the License, or (at your option) any later version.

#

# This library is distributed in the hope that it will be useful,

# but WITHOUT ANY WARRANTY; without even the implied warranty of

# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

# Lesser General Public License for more details.

#

# You should have received a copy of the GNU Lesser General Public

# License along with this library; if not, write to the Free Software

# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA

# 02110-1301 USA

#

######################### END LICENSE BLOCK #########################

"""

Convert old style SBCS model to new

"""

import os

import sys

from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

from string import ascii_letters

import chardet

from chardet import __version__

from chardet.metadata.languages import LANGUAGES

from chardet.sbcharsetprober import SingleByteCharSetModel

# Turn ascii_letters into a set to make other ops easier

ascii_letters = set(ascii_letters)

def normalize_name(charset_name):

"""Convert name to proper Python constant format"""

# Title case to start

charset_name = charset_name.upper()

# Underscores instead of hyphens

charset_name = charset_name.replace("-", "_")

return charset_name

def convert_sbcs_model(old_model, alphabet):

"""Create a SingleByteCharSetModel object representing the charset."""

# Setup tables necessary for computing transition frequencies for model

char_to_order = {i: order for i, order in enumerate(old_model["char_to_order_map"])}

pos_ratio = old_model["typical_positive_ratio"]

keep_ascii_letters = old_model["keep_english_letter"]

curr_model = SingleByteCharSetModel(

charset_name=old_model["charset_name"],

language=old_model["language"],

char_to_order_map=char_to_order,

# language_model is filled in later

language_model=None,

typical_positive_ratio=pos_ratio,

keep_ascii_letters=keep_ascii_letters,

alphabet=alphabet,

)

return curr_model

def print_char_to_order(var_name, order_map, charset_name, output_file):

print(f"{var_name} = {{", file=output_file)

for char, order in sorted(order_map.items()):

char_bytes = bytes(bytearray((char,)))

try:

unicode_char = char_bytes.decode(charset_name)

except UnicodeError:

unicode_char = None

print(f" {char!r}: {order!r}, # {unicode_char!r}", file=output_file)

print("}\n", file=output_file)

def print_language_model(var_name, language_model, output_file, char_ranks):

print(

"# 3: Positive\n" "# 2: Likely\n" "# 1: Unlikely\n" "# 0: Negative\n",

file=output_file,

)

print(f"{var_name} = {{", file=output_file)

for first_char, sub_dict in sorted(language_model.items()):

# Skip empty sub_dicts

if not sub_dict or first_char not in char_ranks:

continue

print(f" {char_ranks[first_char]!r}: {{ # {first_char!r}", file=output_file)

for second_char, likelihood in sorted(sub_dict.items()):

if second_char not in char_ranks:

continue

print(

f" {char_ranks[second_char]!r}: {likelihood!r}, # "

f"{second_char!r}",

file=output_file,

)

print(" },", file=output_file)

print("}\n", file=output_file)

def convert_models_for_lang(language):

"""Convert old SingleByteCharSetModels for the given language"""

# Validate language

language = language.title()

lang_metadata = LANGUAGES.get(language)

if not lang_metadata:

raise ValueError(

f"Unknown language: {language}. If you are adding a model for a"

" new language, you must first update metadata/"

"languages.py"

)

lang_mod_name = f"lang{language.lower()}model"

if not os.path.exists(os.path.join("chardet", lang_mod_name + ".py")):

print(f"Skipping {language} because it does not have an old model.")

return

lang_mod = getattr(chardet, lang_mod_name)

print(

f"\n{language}\n----------------------------------------------------------------"

)

print(f"Keep ASCII Letters: {lang_metadata.use_ascii}")

print(f"Alphabet: {lang_metadata.alphabet}")

# Create char-to-order maps (aka char-to-rank dicts)

charset_models = {}

char_ranks = {}

order_to_chars = {}

for var_name in dir(lang_mod):

if not ("Model" in var_name and "LangModel" not in var_name):

continue

old_model = getattr(lang_mod, var_name)

charset_name = old_model["charset_name"]

print(f"Converting charset model for {charset_name}")

sys.stdout.flush()

charset_models[charset_name] = convert_sbcs_model(

old_model, lang_metadata.alphabet

)

# Since we don't know which charsets have which characters, we have to

# try to reconstruct char_ranks (for letters only, since that's all

# the old language models contain)

for byte_hex, order in charset_models[charset_name].char_to_order_map.items():

# order 64 was basically ignored before because of the off by one

# error, but it's hard to know if training took that into account

if order > 64:

continue

# Convert to bytes in Python 2 and 3

char = bytes(bytearray((byte_hex,)))

try:

unicode_char = char.decode(charset_name)

except UnicodeDecodeError:

continue

if unicode_char not in char_ranks:

char_ranks[unicode_char] = order

order_to_chars[order] = unicode_char

elif char_ranks[unicode_char] != order:

raise ValueError(f"Unstable character ranking for {unicode_char}")

old_lang_model = getattr(lang_mod, f"{language.title()}LangModel")

language_model = {}

# Preserve off-by-one error here by ignoring first column and row

for i in range(1, 64):

if i not in order_to_chars:

continue

lang_char = order_to_chars[i]

language_model[lang_char] = {}

for j in range(1, 64):

if j not in order_to_chars:

continue

lang_char2 = order_to_chars[j]

language_model[lang_char][lang_char2] = old_lang_model[(i * 64) + j]

# Write output files

print(f"Writing output file for {language}\n\n")

sys.stdout.flush()

with open(f"lang{language.lower()}model.py", "w") as output_file:

upper_lang = language.upper()

# print header to set encoding

print(

"#!/usr/bin/env python\n"

"# -*- coding: utf-8 -*-\n\n"

"from chardet.sbcharsetprober import SingleByteCharSetModel\n\n",

file=output_file,

)

lm_name = f"{upper_lang}_LANG_MODEL"

print_language_model(lm_name, language_model, output_file, char_ranks)

print(

"# 255: Undefined characters that did not exist in training text\n"

"# 254: Carriage/Return\n"

"# 253: symbol (punctuation) that does not belong to word\n"

"# 252: 0 - 9\n"

"# 251: Control characters\n\n"

"# Character Mapping Table(s):",

file=output_file,

)

for charset_name, sbcs_model in charset_models.items():

normal_name = normalize_name(charset_name)

char_to_order_name = f"{normal_name}_{upper_lang}_CHAR_TO_ORDER"

print_char_to_order(

char_to_order_name,

sbcs_model.char_to_order_map,

charset_name,

output_file,

)

sbcs_model_name = f"{normal_name}_{upper_lang}_MODEL"

sbcs_model.char_to_order_map.clear()

sbcs_model_repr = (

repr(sbcs_model)

.replace("None", lm_name)

.replace("{}", char_to_order_name)

.replace(", ", (",\n" + " " * (len(sbcs_model_name) + 26)))

)

print(f"{sbcs_model_name} = {sbcs_model_repr}\n", file=output_file)

def main():

parser = ArgumentParser(

description=__doc__, formatter_class=ArgumentDefaultsHelpFormatter

)

parser.add_argument(

"language",

help="The name of the language the input documents are "

"in. Also the name of the language the generated "

"model will detect. If no language is specified, "

"models for all languages known to chardet will be"

" trained.",

nargs="*",

)

parser.add_argument("--version", action="version", version=__version__)

args = parser.parse_args()

if not args.language:

args.language = list(sorted(LANGUAGES.keys()))

for language in args.language:

convert_models_for_lang(language)

if __name__ == "__main__":

main()

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

convert_language_model.py

convert_language_model.py

Files

convert_language_model.py

Latest commit

History

convert_language_model.py

File metadata and controls