__main__.py 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296
  1. import argparse
  2. import sys
  3. from json import dumps
  4. from os.path import abspath, basename, dirname, join, realpath
  5. from platform import python_version
  6. from typing import List, Optional
  7. from unicodedata import unidata_version
  8. import charset_normalizer.md as md_module
  9. from charset_normalizer import from_fp
  10. from charset_normalizer.models import CliDetectionResult
  11. from charset_normalizer.version import __version__
  12. def query_yes_no(question: str, default: str = "yes") -> bool:
  13. """Ask a yes/no question via input() and return their answer.
  14. "question" is a string that is presented to the user.
  15. "default" is the presumed answer if the user just hits <Enter>.
  16. It must be "yes" (the default), "no" or None (meaning
  17. an answer is required of the user).
  18. The "answer" return value is True for "yes" or False for "no".
  19. Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
  20. """
  21. valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
  22. if default is None:
  23. prompt = " [y/n] "
  24. elif default == "yes":
  25. prompt = " [Y/n] "
  26. elif default == "no":
  27. prompt = " [y/N] "
  28. else:
  29. raise ValueError("invalid default answer: '%s'" % default)
  30. while True:
  31. sys.stdout.write(question + prompt)
  32. choice = input().lower()
  33. if default is not None and choice == "":
  34. return valid[default]
  35. elif choice in valid:
  36. return valid[choice]
  37. else:
  38. sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
  39. def cli_detect(argv: Optional[List[str]] = None) -> int:
  40. """
  41. CLI assistant using ARGV and ArgumentParser
  42. :param argv:
  43. :return: 0 if everything is fine, anything else equal trouble
  44. """
  45. parser = argparse.ArgumentParser(
  46. description="The Real First Universal Charset Detector. "
  47. "Discover originating encoding used on text file. "
  48. "Normalize text to unicode."
  49. )
  50. parser.add_argument(
  51. "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
  52. )
  53. parser.add_argument(
  54. "-v",
  55. "--verbose",
  56. action="store_true",
  57. default=False,
  58. dest="verbose",
  59. help="Display complementary information about file if any. "
  60. "Stdout will contain logs about the detection process.",
  61. )
  62. parser.add_argument(
  63. "-a",
  64. "--with-alternative",
  65. action="store_true",
  66. default=False,
  67. dest="alternatives",
  68. help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
  69. )
  70. parser.add_argument(
  71. "-n",
  72. "--normalize",
  73. action="store_true",
  74. default=False,
  75. dest="normalize",
  76. help="Permit to normalize input file. If not set, program does not write anything.",
  77. )
  78. parser.add_argument(
  79. "-m",
  80. "--minimal",
  81. action="store_true",
  82. default=False,
  83. dest="minimal",
  84. help="Only output the charset detected to STDOUT. Disabling JSON output.",
  85. )
  86. parser.add_argument(
  87. "-r",
  88. "--replace",
  89. action="store_true",
  90. default=False,
  91. dest="replace",
  92. help="Replace file when trying to normalize it instead of creating a new one.",
  93. )
  94. parser.add_argument(
  95. "-f",
  96. "--force",
  97. action="store_true",
  98. default=False,
  99. dest="force",
  100. help="Replace file without asking if you are sure, use this flag with caution.",
  101. )
  102. parser.add_argument(
  103. "-t",
  104. "--threshold",
  105. action="store",
  106. default=0.2,
  107. type=float,
  108. dest="threshold",
  109. help="Define a custom maximum amount of chaos allowed in decoded content. 0. <= chaos <= 1.",
  110. )
  111. parser.add_argument(
  112. "--version",
  113. action="version",
  114. version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
  115. __version__,
  116. python_version(),
  117. unidata_version,
  118. "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
  119. ),
  120. help="Show version information and exit.",
  121. )
  122. args = parser.parse_args(argv)
  123. if args.replace is True and args.normalize is False:
  124. print("Use --replace in addition of --normalize only.", file=sys.stderr)
  125. return 1
  126. if args.force is True and args.replace is False:
  127. print("Use --force in addition of --replace only.", file=sys.stderr)
  128. return 1
  129. if args.threshold < 0.0 or args.threshold > 1.0:
  130. print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
  131. return 1
  132. x_ = []
  133. for my_file in args.files:
  134. matches = from_fp(my_file, threshold=args.threshold, explain=args.verbose)
  135. best_guess = matches.best()
  136. if best_guess is None:
  137. print(
  138. 'Unable to identify originating encoding for "{}". {}'.format(
  139. my_file.name,
  140. "Maybe try increasing maximum amount of chaos."
  141. if args.threshold < 1.0
  142. else "",
  143. ),
  144. file=sys.stderr,
  145. )
  146. x_.append(
  147. CliDetectionResult(
  148. abspath(my_file.name),
  149. None,
  150. [],
  151. [],
  152. "Unknown",
  153. [],
  154. False,
  155. 1.0,
  156. 0.0,
  157. None,
  158. True,
  159. )
  160. )
  161. else:
  162. x_.append(
  163. CliDetectionResult(
  164. abspath(my_file.name),
  165. best_guess.encoding,
  166. best_guess.encoding_aliases,
  167. [
  168. cp
  169. for cp in best_guess.could_be_from_charset
  170. if cp != best_guess.encoding
  171. ],
  172. best_guess.language,
  173. best_guess.alphabets,
  174. best_guess.bom,
  175. best_guess.percent_chaos,
  176. best_guess.percent_coherence,
  177. None,
  178. True,
  179. )
  180. )
  181. if len(matches) > 1 and args.alternatives:
  182. for el in matches:
  183. if el != best_guess:
  184. x_.append(
  185. CliDetectionResult(
  186. abspath(my_file.name),
  187. el.encoding,
  188. el.encoding_aliases,
  189. [
  190. cp
  191. for cp in el.could_be_from_charset
  192. if cp != el.encoding
  193. ],
  194. el.language,
  195. el.alphabets,
  196. el.bom,
  197. el.percent_chaos,
  198. el.percent_coherence,
  199. None,
  200. False,
  201. )
  202. )
  203. if args.normalize is True:
  204. if best_guess.encoding.startswith("utf") is True:
  205. print(
  206. '"{}" file does not need to be normalized, as it already came from unicode.'.format(
  207. my_file.name
  208. ),
  209. file=sys.stderr,
  210. )
  211. if my_file.closed is False:
  212. my_file.close()
  213. continue
  214. dir_path = dirname(realpath(my_file.name))
  215. file_name = basename(realpath(my_file.name))
  216. o_: List[str] = file_name.split(".")
  217. if args.replace is False:
  218. o_.insert(-1, best_guess.encoding)
  219. if my_file.closed is False:
  220. my_file.close()
  221. elif (
  222. args.force is False
  223. and query_yes_no(
  224. 'Are you sure to normalize "{}" by replacing it ?'.format(
  225. my_file.name
  226. ),
  227. "no",
  228. )
  229. is False
  230. ):
  231. if my_file.closed is False:
  232. my_file.close()
  233. continue
  234. try:
  235. x_[0].unicode_path = join(dir_path, ".".join(o_))
  236. with open(x_[0].unicode_path, "w", encoding="utf-8") as fp:
  237. fp.write(str(best_guess))
  238. except IOError as e:
  239. print(str(e), file=sys.stderr)
  240. if my_file.closed is False:
  241. my_file.close()
  242. return 2
  243. if my_file.closed is False:
  244. my_file.close()
  245. if args.minimal is False:
  246. print(
  247. dumps(
  248. [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
  249. ensure_ascii=True,
  250. indent=4,
  251. )
  252. )
  253. else:
  254. for my_file in args.files:
  255. print(
  256. ", ".join(
  257. [
  258. el.encoding or "undefined"
  259. for el in x_
  260. if el.path == abspath(my_file.name)
  261. ]
  262. )
  263. )
  264. return 0
  265. if __name__ == "__main__":
  266. cli_detect()