easy_ocr.py 6.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149
  1. #!/usr/bin/python
  2. # -*- coding: iso-8859-15 -*-
  3. """
  4. This file is part of the cintruder project, http://cintruder.03c8.net
  5. Copyright (c) 2012/2016 psy <epsylon@riseup.net>
  6. cintruder is free software; you can redistribute it and/or modify it under
  7. the terms of the GNU General Public License as published by the Free
  8. Software Foundation version 3 of the License.
  9. cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
  10. WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
  11. FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
  12. details.
  13. You should have received a copy of the GNU General Public License along
  14. with cintruder; if not, write to the Free Software Foundation, Inc., 51
  15. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  16. """
  17. from PIL import Image
  18. from operator import itemgetter
  19. import os, hashlib, time, sys, subprocess, platform
  20. import shutil
  21. class CIntruderOCR(object):
  22. """
  23. Class to apply OCR techniques to EasyCaptcha (http://kestas.kuliukas.com/EasyCaptcha/EasyCaptcha/easycaptcha.php)
  24. """
  25. def __init__(self, captcha, options):
  26. # generate words structure (+ previews for gui)
  27. if not os.path.exists("outputs/words/"):
  28. os.mkdir("outputs/words/")
  29. else:
  30. shutil.rmtree("outputs/words/")
  31. os.mkdir("outputs/words/")
  32. if not os.path.exists("core/images/previews/"):
  33. os.mkdir("core/images/previews/")
  34. else:
  35. shutil.rmtree("core/images/previews/")
  36. os.mkdir("core/images/previews/")
  37. if not os.path.exists("core/images/previews/ocr/"):
  38. os.mkdir("core/images/previews/ocr/")
  39. else:
  40. shutil.rmtree("core/images/previews/ocr/")
  41. os.mkdir("core/images/previews/ocr/")
  42. # initialize main CIntruder
  43. try:
  44. im = Image.open(captcha)
  45. im.save("core/images/previews/last-preview.gif")
  46. im2 = Image.new("P", im.size, 255)
  47. im = im.convert("P")
  48. except:
  49. print "Error during OCR process!. Is that captcha supported?\n"
  50. return
  51. colourid = []
  52. try: # extract colour histogram
  53. hist = im.histogram()
  54. except:
  55. print "\n[Error] Something wrong extracting histogram. Aborting...\n"
  56. return
  57. values = {}
  58. for i in range(256):
  59. values[i] = hist[i]
  60. if options.verbose:
  61. print "[Info] Extracting advanced OCR info..."
  62. print "\n============================="
  63. print "Image Histogram (order by >):"
  64. print "============================="
  65. for j, k in sorted(values.items(), key=itemgetter(1), reverse=True)[:10]:
  66. colourid.append(j)
  67. if options.verbose:
  68. print "Colour ID: [", j, "] -> Total pixels:", k
  69. if options.verbose:
  70. print ""
  71. temp = {}
  72. for x in range(im.size[1]):
  73. for y in range(im.size[0]):
  74. pix = im.getpixel((y, x))
  75. temp[pix] = pix
  76. if options.setids:
  77. colourid = int(options.setids)
  78. if pix == colourid:
  79. im2.putpixel((y, x), 0)
  80. else:
  81. if pix == colourid[1]: #id numbers of colours to get (*)
  82. im2.putpixel((y, x), 0)
  83. im2.save("outputs/last-ocr_image-processed.gif")
  84. inletter = False
  85. foundletter = False
  86. start = 0
  87. end = 0
  88. letters = []
  89. for y in range(im2.size[0]):
  90. for x in range(im2.size[1]):
  91. pix = im2.getpixel((y, x))
  92. if pix != 255:
  93. inletter = True
  94. if foundletter == False and inletter == True:
  95. foundletter = True
  96. start = y
  97. if foundletter == True and inletter == False:
  98. foundletter = False
  99. end = y
  100. letters.append((start, end))
  101. inletter = False
  102. count = 0
  103. for letter in letters:
  104. m = hashlib.md5()
  105. im3 = im2.crop(( letter[0], 0, letter[1], im2.size[1] ))
  106. m.update("%s%s"%(time.time(), count))
  107. im3.save("outputs/words/%s.gif"%(m.hexdigest()))
  108. im3.save("core/images/previews/ocr/%s.gif"%(m.hexdigest()))
  109. count += 1
  110. print "[Info] Processing captcha/image with OCR algorithms. Please wait...\n"
  111. print "================="
  112. print "Training Results:"
  113. print "================="
  114. print "\nNumber of 'symbols' found: [", count, "]"
  115. if count == 0:
  116. print "\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n"
  117. print "Try this...\n"
  118. print " 1) Check colour's ID values and quantity of pixels of each by using verbose"
  119. print " 2) Set different ID values to your OCR configration and try it again"
  120. print " 3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target"
  121. print " 4) Maybe this module that you are using is not working for this captcha...\n"
  122. print "------------\n"
  123. else:
  124. path, dirs, files = os.walk("outputs/words/").next()
  125. file_count = str(len(files))
  126. print "\n[Info] Generated [ "+ file_count+ " ] OCR images here:", "outputs/words/\n"
  127. if options.verbose:
  128. # checking for platform to list new words added to dictionary
  129. os_sys = platform.system()
  130. if os_sys == "Windows":
  131. subprocess.call("dir outputs/words/", shell=True)
  132. else:
  133. subprocess.call("ls outputs/words/", shell=True)
  134. print ""
  135. print "Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n"
  136. if __name__ == "__main__":
  137. if sys.argv[1:]:
  138. ocr = CIntruderOCR(sys.argv[1:])
  139. print ("Data correctly extracted!")
  140. else:
  141. print ("You must set a captcha for learn. Ex: inputs/test1.gif")