easy_ocr.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151
  1. #!/usr/bin/python3
  2. # -*- coding: iso-8859-15 -*-
  3. """
  4. This file is part of the cintruder project, https://cintruder.03c8.net
  5. Copyright (c) 2012/2020 psy <epsylon@riseup.net>
  6. cintruder is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation version 3 of the License.
  9. cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
  10. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  12. details.
  13. You should have received a copy of the GNU General Public License along
  14. with cintruder; if not, write to the Free Software Foundation, Inc., 51
  15. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. """
  17. from PIL import Image
  18. from operator import itemgetter
  19. import os, hashlib, time, sys, subprocess, platform
  20. import shutil
  21. class CIntruderOCR(object):
  22. """
  23. Class to apply OCR techniques to EasyCaptcha (http://kestas.kuliukas.com/EasyCaptcha/EasyCaptcha/easycaptcha.php)
  24. """
  25. def __init__(self, captcha, options):
  26. # generate words structure (+ previews for gui)
  27. if not os.path.exists("outputs/words/"):
  28. os.mkdir("outputs/words/")
  29. else:
  30. shutil.rmtree("outputs/words/")
  31. os.mkdir("outputs/words/")
  32. if not os.path.exists("core/images/previews/"):
  33. os.mkdir("core/images/previews/")
  34. else:
  35. shutil.rmtree("core/images/previews/")
  36. os.mkdir("core/images/previews/")
  37. if not os.path.exists("core/images/previews/ocr/"):
  38. os.mkdir("core/images/previews/ocr/")
  39. else:
  40. shutil.rmtree("core/images/previews/ocr/")
  41. os.mkdir("core/images/previews/ocr/")
  42. # initialize main CIntruder
  43. try:
  44. im = Image.open(captcha)
  45. im.save("core/images/previews/last-preview.gif")
  46. im2 = Image.new("P", im.size, 255)
  47. im = im.convert("P")
  48. except:
  49. print("[Error] Fail during OCR process!. Is that captcha supported?\n")
  50. return
  51. colourid = []
  52. try: # extract colour histogram
  53. hist = im.histogram()
  54. except:
  55. print("[Error] Something wrong extracting histogram. Aborting...\n")
  56. return
  57. values = {}
  58. for i in range(256):
  59. values[i] = hist[i]
  60. if options.verbose:
  61. print("[Info] Extracting advanced OCR info...")
  62. print("\n=============================")
  63. print("Image Histogram (order by >):")
  64. print("=============================")
  65. for j, k in sorted(list(values.items()), key=itemgetter(1), reverse=True)[:10]:
  66. colourid.append(j)
  67. if options.verbose:
  68. print("Colour ID: [ "+ str(j) + " ] -> Total pixels: " +str(k))
  69. if options.verbose:
  70. print("")
  71. temp = {}
  72. for x in range(im.size[1]):
  73. for y in range(im.size[0]):
  74. pix = im.getpixel((y, x))
  75. temp[pix] = pix
  76. if options.setids:
  77. colourid = int(options.setids)
  78. if pix == colourid:
  79. im2.putpixel((y, x), 0)
  80. else:
  81. if pix == colourid[1]: #id numbers of colours to get (*)
  82. im2.putpixel((y, x), 0)
  83. im2.save("outputs/last-ocr_image-processed.gif")
  84. inletter = False
  85. foundletter = False
  86. start = 0
  87. end = 0
  88. letters = []
  89. for y in range(im2.size[0]):
  90. for x in range(im2.size[1]):
  91. pix = im2.getpixel((y, x))
  92. if pix != 255:
  93. inletter = True
  94. if foundletter == False and inletter == True:
  95. foundletter = True
  96. start = y
  97. if foundletter == True and inletter == False:
  98. foundletter = False
  99. end = y
  100. letters.append((start, end))
  101. inletter = False
  102. count = 0
  103. for letter in letters:
  104. m = hashlib.md5()
  105. try:
  106. m.update(str(letter))
  107. except:
  108. m.update(str(letter).encode('utf-8'))
  109. im3 = im2.crop(( letter[0], 0, letter[1], im2.size[1] ))
  110. im3.save("outputs/words/%s.gif"%(m.hexdigest()))
  111. im3.save("core/images/previews/ocr/%s.gif"%(m.hexdigest()))
  112. count += 1
  113. print("[Info] Processing captcha/image with OCR algorithms. Please wait...\n")
  114. print("=================")
  115. print("Training Results:")
  116. print("=================")
  117. print("\n[Info] Number of 'symbols' found: "+ str(count)+"\n")
  118. if count == 0:
  119. print("\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n")
  120. print("Try this...\n")
  121. print(" 1) Check colour's ID values and quantity of pixels of each by using verbose")
  122. print(" 2) Set different ID values to your OCR configration and try it again")
  123. print(" 3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target")
  124. print(" 4) Maybe this module that you are using is not working for this captcha...\n")
  125. print("------------\n")
  126. else:
  127. path, dirs, files = next(os.walk("outputs/words/"))
  128. file_count = str(len(files))
  129. print("[Info] Generated [ "+ file_count+ " ] OCR images here: outputs/words/\n")
  130. if options.verbose:
  131. # checking for platform to list new words added to dictionary
  132. os_sys = platform.system()
  133. if os_sys == "Windows":
  134. subprocess.call("dir outputs/words/", shell=True)
  135. else:
  136. subprocess.call("ls outputs/words/", shell=True)
  137. print("")
  138. print("Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n")
  139. if __name__ == "__main__":
  140. if sys.argv[1:]:
  141. ocr = CIntruderOCR(sys.argv[1:])
  142. print ("Data correctly extracted!")
  143. else:
  144. print ("You must set a captcha for learn. Ex: inputs/test1.gif")