ocr.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
  1. #!/usr/bin/python3
  2. # -*- coding: iso-8859-15 -*-
  3. """
  4. This file is part of the cintruder project, https://cintruder.03c8.net
  5. Copyright (c) 2012/2020 psy <epsylon@riseup.net>
  6. cintruder is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation version 3 of the License.
  9. cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
  10. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  12. details.
  13. You should have received a copy of the GNU General Public License along
  14. with cintruder; if not, write to the Free Software Foundation, Inc., 51
  15. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. """
  17. from PIL import Image
  18. from operator import itemgetter
  19. import os, hashlib, time, sys, subprocess, platform
  20. import shutil
  21. class CIntruderOCR(object):
  22. """
  23. Class to apply OCR techniques into captchas (general algorithm)
  24. """
  25. def __init__(self, captcha, options):
  26. # generate words structure (+ previews for gui)
  27. if not os.path.exists("outputs/words/"):
  28. os.mkdir("outputs/words/")
  29. else:
  30. shutil.rmtree("outputs/words/")
  31. os.mkdir("outputs/words/")
  32. if not os.path.exists("core/images/previews/"):
  33. os.mkdir("core/images/previews/")
  34. else:
  35. shutil.rmtree("core/images/previews/")
  36. os.mkdir("core/images/previews/")
  37. if not os.path.exists("core/images/previews/ocr/"):
  38. os.mkdir("core/images/previews/ocr/")
  39. else:
  40. shutil.rmtree("core/images/previews/ocr/")
  41. os.mkdir("core/images/previews/ocr/")
  42. # initialize main CIntruder
  43. try:
  44. im = Image.open(captcha)
  45. im.save("core/images/previews/last-preview.gif")
  46. im2 = Image.new("P", im.size, 255)
  47. im = im.convert("P")
  48. except:
  49. print("[Error] Fail during OCR process... Is that captcha supported?\n")
  50. return
  51. colourid = []
  52. try: # extract colour histogram
  53. hist = im.histogram()
  54. except:
  55. print("[Error] Something wrong extracting histogram. Aborting...\n")
  56. return
  57. values = {}
  58. for i in range(256):
  59. values[i] = hist[i]
  60. if options.verbose:
  61. print("[Info] Extracting advanced OCR info...")
  62. print("\n=============================")
  63. print("Image Histogram (order by >):")
  64. print("=============================")
  65. count = 0
  66. for j, k in sorted(list(values.items()), key=itemgetter(1), reverse=True)[:10]:
  67. colourid.append(j)
  68. if options.verbose:
  69. count = count + 1
  70. if count == 1: # first is background
  71. print("Colour ID: [ "+ str(j)+ " ] -> Total pixels: "+ str(k)+ " [Background]")
  72. else:
  73. print("Colour ID: [ "+ str(j)+ " ] -> Total pixels: "+ str(k))
  74. if options.verbose:
  75. print("")
  76. temp = {}
  77. for x in range(im.size[1]):
  78. for y in range(im.size[0]):
  79. pix = im.getpixel((y, x))
  80. temp[pix] = pix
  81. if options.setids:
  82. colourid = int(options.setids)
  83. if pix == colourid:
  84. im2.putpixel((y, x), 0)
  85. else:
  86. if pix == colourid[1]: #id numbers of colours to get (*)
  87. im2.putpixel((y, x), 0)
  88. im2.save("outputs/last-ocr_image-processed.gif")
  89. inletter = False
  90. foundletter = False
  91. start = 0
  92. end = 0
  93. letters = []
  94. for y in range(im2.size[0]):
  95. for x in range(im2.size[1]):
  96. pix = im2.getpixel((y, x))
  97. if pix != 255:
  98. inletter = True
  99. if foundletter == False and inletter == True:
  100. foundletter = True
  101. start = y
  102. if foundletter == True and inletter == False:
  103. foundletter = False
  104. end = y
  105. letters.append((start, end))
  106. inletter = False
  107. count = 0
  108. for letter in letters:
  109. m = hashlib.md5()
  110. try:
  111. m.update(str(letter))
  112. except:
  113. m.update(str(letter).encode('utf-8'))
  114. im3 = im2.crop(( letter[0], 0, letter[1], im2.size[1] ))
  115. im3.save("outputs/words/%s.gif"%(m.hexdigest()))
  116. im3.save("core/images/previews/ocr/%s.gif"%(m.hexdigest()))
  117. count += 1
  118. print("[Info] Processing captcha/image with OCR algorithms. Please wait...\n")
  119. print("=================")
  120. print("Training Results:")
  121. print("=================")
  122. print("\n[Info] Number of 'symbols' found: "+ str(count)+"\n")
  123. if count == 0:
  124. print("\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n")
  125. print("Try this...\n")
  126. print(" 1) Check colour's ID values and quantity of pixels of each symbol using verbose")
  127. print(" 2) Set different ID values to your OCR configuration and try it again")
  128. print(" 3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target")
  129. print(" 4) Maybe there is a module that works correctly for this captcha...\n")
  130. print("------------\n")
  131. else:
  132. path, dirs, files = next(os.walk("outputs/words/"))
  133. file_count = str(len(files))
  134. print ("[Info] Generated [ "+ file_count+ " ] OCR images here: outputs/words/\n")
  135. if options.verbose:
  136. # checking for platform to list new words added to dictionary
  137. os_sys = platform.system()
  138. if os_sys == "Windows":
  139. subprocess.call("dir outputs/words/", shell=True)
  140. else:
  141. subprocess.call("ls outputs/words/", shell=True)
  142. print("")
  143. print("Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n")
  144. if __name__ == "__main__":
  145. if sys.argv[1:]:
  146. ocr = CIntruderOCR(sys.argv[1:])
  147. print ("Data correctly extracted!")
  148. else:
  149. print ("You must set a captcha for learn. Ex: inputs/test1.gif")