  1. from functools import lru_cache
  2. from logging import getLogger
  3. from typing import List, Optional
  4. from .constant import (
  5. COMMON_SAFE_ASCII_CHARACTERS,
  6. TRACE,
  7. UNICODE_SECONDARY_RANGE_KEYWORD,
  8. )
  9. from .utils import (
  10. is_accentuated,
  11. is_arabic,
  12. is_arabic_isolated_form,
  13. is_case_variable,
  14. is_cjk,
  15. is_emoticon,
  16. is_hangul,
  17. is_hiragana,
  18. is_katakana,
  19. is_latin,
  20. is_punctuation,
  21. is_separator,
  22. is_symbol,
  23. is_thai,
  24. is_unprintable,
  25. remove_accent,
  26. unicode_range,
  27. )
class MessDetectorPlugin:
    """
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    """

    def eligible(self, character: str) -> bool:
        """
        Determine if given character should be fed in.
        """
        raise NotImplementedError  # pragma: nocover

    def feed(self, character: str) -> None:
        """
        The main routine to be executed upon character.
        Insert the logic in which the text would be considered chaotic.
        """
        raise NotImplementedError  # pragma: nocover

    def reset(self) -> None:  # pragma: no cover
        """
        Permit to reset the plugin to the initial state.
        """
        raise NotImplementedError

    @property
    def ratio(self) -> float:
        """
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        """
        raise NotImplementedError  # pragma: nocover
  56. class TooManySymbolOrPunctuationPlugin(MessDetectorPlugin):
  57. def __init__(self) -> None:
  58. self._punctuation_count: int = 0
  59. self._symbol_count: int = 0
  60. self._character_count: int = 0
  61. self._last_printable_char: Optional[str] = None
  62. self._frenzy_symbol_in_word: bool = False
  63. def eligible(self, character: str) -> bool:
  64. return character.isprintable()
  65. def feed(self, character: str) -> None:
  66. self._character_count += 1
  67. if (
  68. character != self._last_printable_char
  69. and character not in COMMON_SAFE_ASCII_CHARACTERS
  70. ):
  71. if is_punctuation(character):
  72. self._punctuation_count += 1
  73. elif (
  74. character.isdigit() is False
  75. and is_symbol(character)
  76. and is_emoticon(character) is False
  77. ):
  78. self._symbol_count += 2
  79. self._last_printable_char = character
  80. def reset(self) -> None: # pragma: no cover
  81. self._punctuation_count = 0
  82. self._character_count = 0
  83. self._symbol_count = 0
  84. @property
  85. def ratio(self) -> float:
  86. if self._character_count == 0:
  87. return 0.0
  88. ratio_of_punctuation: float = (
  89. self._punctuation_count + self._symbol_count
  90. ) / self._character_count
  91. return ratio_of_punctuation if ratio_of_punctuation >= 0.3 else 0.0
  92. class TooManyAccentuatedPlugin(MessDetectorPlugin):
  93. def __init__(self) -> None:
  94. self._character_count: int = 0
  95. self._accentuated_count: int = 0
  96. def eligible(self, character: str) -> bool:
  97. return character.isalpha()
  98. def feed(self, character: str) -> None:
  99. self._character_count += 1
  100. if is_accentuated(character):
  101. self._accentuated_count += 1
  102. def reset(self) -> None: # pragma: no cover
  103. self._character_count = 0
  104. self._accentuated_count = 0
  105. @property
  106. def ratio(self) -> float:
  107. if self._character_count < 8:
  108. return 0.0
  109. ratio_of_accentuation: float = self._accentuated_count / self._character_count
  110. return ratio_of_accentuation if ratio_of_accentuation >= 0.35 else 0.0
  111. class UnprintablePlugin(MessDetectorPlugin):
  112. def __init__(self) -> None:
  113. self._unprintable_count: int = 0
  114. self._character_count: int = 0
  115. def eligible(self, character: str) -> bool:
  116. return True
  117. def feed(self, character: str) -> None:
  118. if is_unprintable(character):
  119. self._unprintable_count += 1
  120. self._character_count += 1
  121. def reset(self) -> None: # pragma: no cover
  122. self._unprintable_count = 0
  123. @property
  124. def ratio(self) -> float:
  125. if self._character_count == 0:
  126. return 0.0
  127. return (self._unprintable_count * 8) / self._character_count
  128. class SuspiciousDuplicateAccentPlugin(MessDetectorPlugin):
  129. def __init__(self) -> None:
  130. self._successive_count: int = 0
  131. self._character_count: int = 0
  132. self._last_latin_character: Optional[str] = None
  133. def eligible(self, character: str) -> bool:
  134. return character.isalpha() and is_latin(character)
  135. def feed(self, character: str) -> None:
  136. self._character_count += 1
  137. if (
  138. self._last_latin_character is not None
  139. and is_accentuated(character)
  140. and is_accentuated(self._last_latin_character)
  141. ):
  142. if character.isupper() and self._last_latin_character.isupper():
  143. self._successive_count += 1
  144. # Worse if its the same char duplicated with different accent.
  145. if remove_accent(character) == remove_accent(self._last_latin_character):
  146. self._successive_count += 1
  147. self._last_latin_character = character
  148. def reset(self) -> None: # pragma: no cover
  149. self._successive_count = 0
  150. self._character_count = 0
  151. self._last_latin_character = None
  152. @property
  153. def ratio(self) -> float:
  154. if self._character_count == 0:
  155. return 0.0
  156. return (self._successive_count * 2) / self._character_count
  157. class SuspiciousRange(MessDetectorPlugin):
  158. def __init__(self) -> None:
  159. self._suspicious_successive_range_count: int = 0
  160. self._character_count: int = 0
  161. self._last_printable_seen: Optional[str] = None
  162. def eligible(self, character: str) -> bool:
  163. return character.isprintable()
  164. def feed(self, character: str) -> None:
  165. self._character_count += 1
  166. if (
  167. character.isspace()
  168. or is_punctuation(character)
  169. or character in COMMON_SAFE_ASCII_CHARACTERS
  170. ):
  171. self._last_printable_seen = None
  172. return
  173. if self._last_printable_seen is None:
  174. self._last_printable_seen = character
  175. return
  176. unicode_range_a: Optional[str] = unicode_range(self._last_printable_seen)
  177. unicode_range_b: Optional[str] = unicode_range(character)
  178. if is_suspiciously_successive_range(unicode_range_a, unicode_range_b):
  179. self._suspicious_successive_range_count += 1
  180. self._last_printable_seen = character
  181. def reset(self) -> None: # pragma: no cover
  182. self._character_count = 0
  183. self._suspicious_successive_range_count = 0
  184. self._last_printable_seen = None
  185. @property
  186. def ratio(self) -> float:
  187. if self._character_count <= 24:
  188. return 0.0
  189. ratio_of_suspicious_range_usage: float = (
  190. self._suspicious_successive_range_count * 2
  191. ) / self._character_count
  192. return ratio_of_suspicious_range_usage
  193. class SuperWeirdWordPlugin(MessDetectorPlugin):
  194. def __init__(self) -> None:
  195. self._word_count: int = 0
  196. self._bad_word_count: int = 0
  197. self._foreign_long_count: int = 0
  198. self._is_current_word_bad: bool = False
  199. self._foreign_long_watch: bool = False
  200. self._character_count: int = 0
  201. self._bad_character_count: int = 0
  202. self._buffer: str = ""
  203. self._buffer_accent_count: int = 0
  204. def eligible(self, character: str) -> bool:
  205. return True
  206. def feed(self, character: str) -> None:
  207. if character.isalpha():
  208. self._buffer += character
  209. if is_accentuated(character):
  210. self._buffer_accent_count += 1
  211. if (
  212. self._foreign_long_watch is False
  213. and (is_latin(character) is False or is_accentuated(character))
  214. and is_cjk(character) is False
  215. and is_hangul(character) is False
  216. and is_katakana(character) is False
  217. and is_hiragana(character) is False
  218. and is_thai(character) is False
  219. ):
  220. self._foreign_long_watch = True
  221. return
  222. if not self._buffer:
  223. return
  224. if (
  225. character.isspace() or is_punctuation(character) or is_separator(character)
  226. ) and self._buffer:
  227. self._word_count += 1
  228. buffer_length: int = len(self._buffer)
  229. self._character_count += buffer_length
  230. if buffer_length >= 4:
  231. if self._buffer_accent_count / buffer_length > 0.34:
  232. self._is_current_word_bad = True
  233. # Word/Buffer ending with an upper case accentuated letter are so rare,
  234. # that we will consider them all as suspicious. Same weight as foreign_long suspicious.
  235. if (
  236. is_accentuated(self._buffer[-1])
  237. and self._buffer[-1].isupper()
  238. and all(_.isupper() for _ in self._buffer) is False
  239. ):
  240. self._foreign_long_count += 1
  241. self._is_current_word_bad = True
  242. if buffer_length >= 24 and self._foreign_long_watch:
  243. camel_case_dst = [
  244. i
  245. for c, i in zip(self._buffer, range(0, buffer_length))
  246. if c.isupper()
  247. ]
  248. probable_camel_cased: bool = False
  249. if camel_case_dst and (len(camel_case_dst) / buffer_length <= 0.3):
  250. probable_camel_cased = True
  251. if not probable_camel_cased:
  252. self._foreign_long_count += 1
  253. self._is_current_word_bad = True
  254. if self._is_current_word_bad:
  255. self._bad_word_count += 1
  256. self._bad_character_count += len(self._buffer)
  257. self._is_current_word_bad = False
  258. self._foreign_long_watch = False
  259. self._buffer = ""
  260. self._buffer_accent_count = 0
  261. elif (
  262. character not in {"<", ">", "-", "=", "~", "|", "_"}
  263. and character.isdigit() is False
  264. and is_symbol(character)
  265. ):
  266. self._is_current_word_bad = True
  267. self._buffer += character
  268. def reset(self) -> None: # pragma: no cover
  269. self._buffer = ""
  270. self._is_current_word_bad = False
  271. self._foreign_long_watch = False
  272. self._bad_word_count = 0
  273. self._word_count = 0
  274. self._character_count = 0
  275. self._bad_character_count = 0
  276. self._foreign_long_count = 0
  277. @property
  278. def ratio(self) -> float:
  279. if self._word_count <= 10 and self._foreign_long_count == 0:
  280. return 0.0
  281. return self._bad_character_count / self._character_count
  282. class CjkInvalidStopPlugin(MessDetectorPlugin):
  283. """
  284. GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
  285. can be easily detected. Searching for the overuse of '丅' and '丄'.
  286. """
  287. def __init__(self) -> None:
  288. self._wrong_stop_count: int = 0
  289. self._cjk_character_count: int = 0
  290. def eligible(self, character: str) -> bool:
  291. return True
  292. def feed(self, character: str) -> None:
  293. if character in {"丅", "丄"}:
  294. self._wrong_stop_count += 1
  295. return
  296. if is_cjk(character):
  297. self._cjk_character_count += 1
  298. def reset(self) -> None: # pragma: no cover
  299. self._wrong_stop_count = 0
  300. self._cjk_character_count = 0
  301. @property
  302. def ratio(self) -> float:
  303. if self._cjk_character_count < 16:
  304. return 0.0
  305. return self._wrong_stop_count / self._cjk_character_count
class ArchaicUpperLowerPlugin(MessDetectorPlugin):
    """
    Flag abnormal alternation of upper and lower case letters inside
    non-ASCII chunks (e.g. "aBcDeF"-style sequences), which is typical of
    a wrongly decoded byte stream.
    """

    def __init__(self) -> None:
        # True when the previous pair of letters already alternated case;
        # a second alternation in a row scores points.
        self._buf: bool = False

        self._character_count_since_last_sep: int = 0

        self._successive_upper_lower_count: int = 0
        self._successive_upper_lower_count_final: int = 0

        self._character_count: int = 0

        self._last_alpha_seen: Optional[str] = None
        # Whether the current chunk (since the last separator) is pure ASCII.
        self._current_ascii_only: bool = True

    def eligible(self, character: str) -> bool:
        return True

    def feed(self, character: str) -> None:
        is_concerned = character.isalpha() and is_case_variable(character)
        chunk_sep = is_concerned is False

        if chunk_sep and self._character_count_since_last_sep > 0:
            # Chunk ended: commit its alternation score only for short,
            # non-ASCII chunks not terminated by a digit.
            if (
                self._character_count_since_last_sep <= 64
                and character.isdigit() is False
                and self._current_ascii_only is False
            ):
                self._successive_upper_lower_count_final += (
                    self._successive_upper_lower_count
                )

            self._successive_upper_lower_count = 0
            self._character_count_since_last_sep = 0
            self._last_alpha_seen = None
            self._buf = False
            self._character_count += 1
            self._current_ascii_only = True

            return

        if self._current_ascii_only is True and character.isascii() is False:
            self._current_ascii_only = False

        if self._last_alpha_seen is not None:
            if (character.isupper() and self._last_alpha_seen.islower()) or (
                character.islower() and self._last_alpha_seen.isupper()
            ):
                if self._buf is True:
                    # Two alternations in a row: score double.
                    self._successive_upper_lower_count += 2
                    self._buf = False
                else:
                    self._buf = True
            else:
                self._buf = False

        self._character_count += 1
        self._character_count_since_last_sep += 1
        self._last_alpha_seen = character

    def reset(self) -> None:  # pragma: no cover
        self._character_count = 0
        self._character_count_since_last_sep = 0
        self._successive_upper_lower_count = 0
        self._successive_upper_lower_count_final = 0
        self._last_alpha_seen = None
        self._buf = False
        self._current_ascii_only = True

    @property
    def ratio(self) -> float:
        if self._character_count == 0:
            return 0.0

        return self._successive_upper_lower_count_final / self._character_count
  365. class ArabicIsolatedFormPlugin(MessDetectorPlugin):
  366. def __init__(self) -> None:
  367. self._character_count: int = 0
  368. self._isolated_form_count: int = 0
  369. def reset(self) -> None: # pragma: no cover
  370. self._character_count = 0
  371. self._isolated_form_count = 0
  372. def eligible(self, character: str) -> bool:
  373. return is_arabic(character)
  374. def feed(self, character: str) -> None:
  375. self._character_count += 1
  376. if is_arabic_isolated_form(character):
  377. self._isolated_form_count += 1
  378. @property
  379. def ratio(self) -> float:
  380. if self._character_count < 8:
  381. return 0.0
  382. isolated_form_usage: float = self._isolated_form_count / self._character_count
  383. return isolated_form_usage
  384. @lru_cache(maxsize=1024)
  385. def is_suspiciously_successive_range(
  386. unicode_range_a: Optional[str], unicode_range_b: Optional[str]
  387. ) -> bool:
  388. """
  389. Determine if two Unicode range seen next to each other can be considered as suspicious.
  390. """
  391. if unicode_range_a is None or unicode_range_b is None:
  392. return True
  393. if unicode_range_a == unicode_range_b:
  394. return False
  395. if "Latin" in unicode_range_a and "Latin" in unicode_range_b:
  396. return False
  397. if "Emoticons" in unicode_range_a or "Emoticons" in unicode_range_b:
  398. return False
  399. # Latin characters can be accompanied with a combining diacritical mark
  400. # eg. Vietnamese.
  401. if ("Latin" in unicode_range_a or "Latin" in unicode_range_b) and (
  402. "Combining" in unicode_range_a or "Combining" in unicode_range_b
  403. ):
  404. return False
  405. keywords_range_a, keywords_range_b = unicode_range_a.split(
  406. " "
  407. ), unicode_range_b.split(" ")
  408. for el in keywords_range_a:
  409. if el in UNICODE_SECONDARY_RANGE_KEYWORD:
  410. continue
  411. if el in keywords_range_b:
  412. return False
  413. # Japanese Exception
  414. range_a_jp_chars, range_b_jp_chars = (
  415. unicode_range_a
  416. in (
  417. "Hiragana",
  418. "Katakana",
  419. ),
  420. unicode_range_b in ("Hiragana", "Katakana"),
  421. )
  422. if (range_a_jp_chars or range_b_jp_chars) and (
  423. "CJK" in unicode_range_a or "CJK" in unicode_range_b
  424. ):
  425. return False
  426. if range_a_jp_chars and range_b_jp_chars:
  427. return False
  428. if "Hangul" in unicode_range_a or "Hangul" in unicode_range_b:
  429. if "CJK" in unicode_range_a or "CJK" in unicode_range_b:
  430. return False
  431. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  432. return False
  433. # Chinese/Japanese use dedicated range for punctuation and/or separators.
  434. if ("CJK" in unicode_range_a or "CJK" in unicode_range_b) or (
  435. unicode_range_a in ["Katakana", "Hiragana"]
  436. and unicode_range_b in ["Katakana", "Hiragana"]
  437. ):
  438. if "Punctuation" in unicode_range_a or "Punctuation" in unicode_range_b:
  439. return False
  440. if "Forms" in unicode_range_a or "Forms" in unicode_range_b:
  441. return False
  442. if unicode_range_a == "Basic Latin" or unicode_range_b == "Basic Latin":
  443. return False
  444. return True
@lru_cache(maxsize=2048)
def mess_ratio(
    decoded_sequence: str, maximum_threshold: float = 0.2, debug: bool = False
) -> float:
    """
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    """
    # One instance of every registered detector (all direct subclasses).
    detectors: List[MessDetectorPlugin] = [
        md_class() for md_class in MessDetectorPlugin.__subclasses__()
    ]

    # +1 accounts for the trailing "\n" appended below to flush
    # word-buffering detectors.
    length: int = len(decoded_sequence) + 1

    mean_mess_ratio: float = 0.0

    # Checkpoint interval: larger inputs are re-evaluated less often.
    if length < 512:
        intermediary_mean_mess_ratio_calc: int = 32
    elif length <= 1024:
        intermediary_mean_mess_ratio_calc = 64
    else:
        intermediary_mean_mess_ratio_calc = 128

    for character, index in zip(decoded_sequence + "\n", range(length)):
        for detector in detectors:
            if detector.eligible(character):
                detector.feed(character)

        # Periodically (and at the very end) sum all detector ratios and
        # abort early once the threshold is exceeded.
        if (
            index > 0 and index % intermediary_mean_mess_ratio_calc == 0
        ) or index == length - 1:
            mean_mess_ratio = sum(dt.ratio for dt in detectors)

            if mean_mess_ratio >= maximum_threshold:
                break

    if debug:
        logger = getLogger("charset_normalizer")

        logger.log(
            TRACE,
            "Mess-detector extended-analysis start. "
            f"intermediary_mean_mess_ratio_calc={intermediary_mean_mess_ratio_calc} mean_mess_ratio={mean_mess_ratio} "
            f"maximum_threshold={maximum_threshold}",
        )

        if len(decoded_sequence) > 16:
            logger.log(TRACE, f"Starting with: {decoded_sequence[:16]}")
            logger.log(TRACE, f"Ending with: {decoded_sequence[-16::]}")

        for dt in detectors:  # pragma: nocover
            logger.log(TRACE, f"{dt.__class__}: {dt.ratio}")

    return round(mean_mess_ratio, 3)