ocr.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153
  1. #!/usr/bin/python
  2. # -*- coding: iso-8859-15 -*-
  3. """
  4. This file is part of the cintruder project, http://cintruder.03c8.net
  5. Copyright (c) 2012/2016 psy <epsylon@riseup.net>
  6. cintruder is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation version 3 of the License.
  9. cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
  10. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  12. details.
  13. You should have received a copy of the GNU General Public License along
  14. with cintruder; if not, write to the Free Software Foundation, Inc., 51
  15. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. """
  17. from PIL import Image
  18. from operator import itemgetter
  19. import os, hashlib, time, sys, subprocess, platform
  20. import shutil
  21. class CIntruderOCR(object):
  22. """
  23. Class to apply OCR techniques into captchas (general algorithm)
  24. """
  25. def __init__(self, captcha, options):
  26. # generate words structure (+ previews for gui)
  27. if not os.path.exists("outputs/words/"):
  28. os.mkdir("outputs/words/")
  29. else:
  30. shutil.rmtree("outputs/words/")
  31. os.mkdir("outputs/words/")
  32. if not os.path.exists("core/images/previews/"):
  33. os.mkdir("core/images/previews/")
  34. else:
  35. shutil.rmtree("core/images/previews/")
  36. os.mkdir("core/images/previews/")
  37. if not os.path.exists("core/images/previews/ocr/"):
  38. os.mkdir("core/images/previews/ocr/")
  39. else:
  40. shutil.rmtree("core/images/previews/ocr/")
  41. os.mkdir("core/images/previews/ocr/")
  42. # initialize main CIntruder
  43. try:
  44. im = Image.open(captcha)
  45. im.save("core/images/previews/last-preview.gif")
  46. im2 = Image.new("P", im.size, 255)
  47. im = im.convert("P")
  48. except:
  49. print "Error during OCR process... Is that captcha supported?\n"
  50. return
  51. colourid = []
  52. try: # extract colour histogram
  53. hist = im.histogram()
  54. except:
  55. print "\n[Error] Something wrong extracting histogram. Aborting...\n"
  56. return
  57. values = {}
  58. for i in range(256):
  59. values[i] = hist[i]
  60. if options.verbose:
  61. print "\n[Info] Extracting advanced OCR info..."
  62. print "\n============================="
  63. print "Image Histogram (order by >):"
  64. print "============================="
  65. count = 0
  66. for j, k in sorted(values.items(), key=itemgetter(1), reverse=True)[:10]:
  67. colourid.append(j)
  68. if options.verbose:
  69. count = count + 1
  70. if count == 1: # first is background
  71. print "Colour ID: [", j, "] -> Total pixels:", k, "[Background]"
  72. else:
  73. print "Colour ID: [", j, "] -> Total pixels:", k
  74. if options.verbose:
  75. print ""
  76. temp = {}
  77. for x in range(im.size[1]):
  78. for y in range(im.size[0]):
  79. pix = im.getpixel((y, x))
  80. temp[pix] = pix
  81. if options.setids:
  82. colourid = int(options.setids)
  83. if pix == colourid:
  84. im2.putpixel((y, x), 0)
  85. else:
  86. if pix == colourid[1]: #id numbers of colours to get (*)
  87. im2.putpixel((y, x), 0)
  88. im2.save("outputs/last-ocr_image-processed.gif")
  89. inletter = False
  90. foundletter = False
  91. start = 0
  92. end = 0
  93. letters = []
  94. for y in range(im2.size[0]):
  95. for x in range(im2.size[1]):
  96. pix = im2.getpixel((y, x))
  97. if pix != 255:
  98. inletter = True
  99. if foundletter == False and inletter == True:
  100. foundletter = True
  101. start = y
  102. if foundletter == True and inletter == False:
  103. foundletter = False
  104. end = y
  105. letters.append((start, end))
  106. inletter = False
  107. count = 0
  108. for letter in letters:
  109. m = hashlib.md5()
  110. im3 = im2.crop(( letter[0], 0, letter[1], im2.size[1] ))
  111. m.update("%s%s"%(time.time(), count))
  112. im3.save("outputs/words/%s.gif"%(m.hexdigest()))
  113. im3.save("core/images/previews/ocr/%s.gif"%(m.hexdigest()))
  114. count += 1
  115. print "[Info] Processing captcha/image with OCR algorithms. Please wait...\n"
  116. print "================="
  117. print "Training Results:"
  118. print "================="
  119. print "[Info] Number of 'symbols' found: [", count, "]"
  120. if count == 0:
  121. print "\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n"
  122. print "Try this...\n"
  123. print " 1) Check colour's ID values and quantity of pixels of each by using verbose"
  124. print " 2) Set different ID values to your OCR configuration and try it again"
  125. print " 3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target"
  126. print " 4) Maybe there is a module that works correctly for this captcha...\n"
  127. print "------------\n"
  128. else:
  129. path, dirs, files = os.walk("outputs/words/").next()
  130. file_count = str(len(files))
  131. print "[Info] Generated [ "+ file_count+ " ] OCR images here:", "outputs/words/\n"
  132. if options.verbose:
  133. # checking for platform to list new words added to dictionary
  134. os_sys = platform.system()
  135. if os_sys == "Windows":
  136. subprocess.call("dir outputs/words/", shell=True)
  137. else:
  138. subprocess.call("ls outputs/words/", shell=True)
  139. print ""
  140. print "Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n"
  141. if __name__ == "__main__":
  142. if sys.argv[1:]:
  143. ocr = CIntruderOCR(sys.argv[1:])
  144. print ("Data correctly extracted!")
  145. else:
  146. print ("You must set a captcha for learn. Ex: inputs/test1.gif")