123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153 |
- #!/usr/bin/python
- # -*- coding: iso-8859-15 -*-
- """
- This file is part of the cintruder project, http://cintruder.03c8.net
- Copyright (c) 2012/2016 psy <epsylon@riseup.net>
- cintruder is free software; you can redistribute it and/or modify it under
- the terms of the GNU General Public License as published by the Free
- Software Foundation version 3 of the License.
- cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
- WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
- FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
- details.
- You should have received a copy of the GNU General Public License along
- with cintruder; if not, write to the Free Software Foundation, Inc., 51
- Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- """
- from PIL import Image
- from operator import itemgetter
- import os, hashlib, time, sys, subprocess, platform
- import shutil
class CIntruderOCR(object):
    """
    Class to apply OCR techniques into captchas (general algorithm)

    All of the work is done by the constructor: it isolates one palette
    colour of the captcha, splits the result into vertical slices (one per
    'symbol') and writes every slice as a GIF into ``outputs/words/`` and
    ``core/images/previews/ocr/``. Nothing is stored on the instance.
    """

    # Folders that are wiped and recreated on every run. The order matters:
    # resetting the previews folder also removes its "ocr/" subfolder, which
    # is then recreated by the last entry.
    _WORK_DIRS = (
        "outputs/words/",
        "core/images/previews/",
        "core/images/previews/ocr/",
    )

    def __init__(self, captcha, options):
        """
        Run the whole OCR extraction for one captcha.

        captcha -- anything accepted by ``Image.open`` (path or file object).
        options -- object exposing ``verbose`` (truthy to print the colour
                   histogram and the generated files) and ``setids`` (colour
                   ID to extract; when falsy the second most frequent colour
                   is used).

        On an unreadable image an error message is printed and the
        constructor returns early without raising.
        """
        # generate words structure (+ previews for gui)
        for folder in self._WORK_DIRS:
            self._reset_dir(folder)
        # initialize main CIntruder
        try:
            im = Image.open(captcha)
            im.save("core/images/previews/last-preview.gif")
            im2 = Image.new("P", im.size, 255)
            im = im.convert("P")
        except Exception:
            # best-effort: any decoding problem is reported, not raised
            print("Error during OCR process... Is that captcha supported?\n")
            return
        try: # extract colour histogram
            hist = im.histogram()
        except Exception:
            print("\n[Error] Something wrong extracting histogram. Aborting...\n")
            return
        colourid = self._rank_colours(hist, options.verbose)
        # id number of the colour to get (*): user supplied, otherwise the
        # second most frequent one (the first is taken as background)
        if options.setids:
            target = int(options.setids)
        else:
            target = colourid[1]
        width, height = im.size
        # copy only the pixels of the chosen colour, as 0 over a 255 canvas
        for row in range(height):
            for col in range(width):
                if im.getpixel((col, row)) == target:
                    im2.putpixel((col, row), 0)
        im2.save("outputs/last-ocr_image-processed.gif")
        # a column belongs to a symbol when any of its pixels was copied
        ink_columns = [
            any(im2.getpixel((col, row)) != 255 for row in range(height))
            for col in range(width)
        ]
        letters = self._letter_spans(ink_columns)
        count = 0
        for start, end in letters:
            m = hashlib.md5()
            # timestamp + counter gives a distinct file name per symbol;
            # encoded because hashlib requires bytes on Python 3
            m.update(("%s%s" % (time.time(), count)).encode("utf-8"))
            im3 = im2.crop((start, 0, end, height))
            name = m.hexdigest()
            im3.save("outputs/words/%s.gif" % name)
            im3.save("core/images/previews/ocr/%s.gif" % name)
            count += 1
        self._report(count, options.verbose)

    @staticmethod
    def _reset_dir(path):
        """Delete *path* if it exists and create it again, empty."""
        if os.path.exists(path):
            shutil.rmtree(path)
        # makedirs so a missing parent folder does not abort the run
        os.makedirs(path)

    @staticmethod
    def _rank_colours(hist, verbose):
        """
        Return the IDs of the 10 most frequent colours, most frequent first.

        hist    -- sequence with the pixel count of each colour ID; only the
                   first 256 entries are considered.
        verbose -- when truthy, the ranking is also printed.
        """
        ranked = sorted(enumerate(hist[:256]), key=itemgetter(1), reverse=True)[:10]
        if verbose:
            print("\n[Info] Extracting advanced OCR info...")
            print("\n=============================")
            print("Image Histogram (order by >):")
            print("=============================")
            for position, (colour, total) in enumerate(ranked):
                line = "Colour ID: [ %s ] -> Total pixels: %s" % (colour, total)
                if position == 0: # first is background
                    line += " [Background]"
                print(line)
            print("")
        return [colour for colour, _ in ranked]

    @staticmethod
    def _letter_spans(ink_columns):
        """
        Group consecutive inked columns into ``(start, end)`` spans.

        ink_columns -- one boolean per image column, True when the column
                       contains at least one symbol pixel.

        ``end`` is exclusive (the first blank column after the symbol). A
        symbol that reaches the last column is closed at
        ``len(ink_columns)`` instead of being dropped.
        """
        spans = []
        start = None
        for index, has_ink in enumerate(ink_columns):
            if has_ink:
                if start is None:
                    start = index
            elif start is not None:
                spans.append((start, index))
                start = None
        if start is not None:
            spans.append((start, len(ink_columns)))
        return spans

    @staticmethod
    def _report(count, verbose):
        """Print the training summary for *count* extracted symbols."""
        print("[Info] Processing captcha/image with OCR algorithms. Please wait...\n")
        print("=================")
        print("Training Results:")
        print("=================")
        print("[Info] Number of 'symbols' found: [ %s ]" % count)
        if count == 0:
            print("\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n")
            print("Try this...\n")
            print(" 1) Check colour's ID values and quantity of pixels of each by using verbose")
            print(" 2) Set different ID values to your OCR configuration and try it again")
            print(" 3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target")
            print(" 4) Maybe there is a module that works correctly for this captcha...\n")
            print("------------\n")
            return
        files = next(os.walk("outputs/words/"))[2]
        print("[Info] Generated [ %s ] OCR images here: outputs/words/\n" % len(files))
        if verbose:
            # checking for platform to list new words added to dictionary
            if platform.system() == "Windows":
                # "dir" is a cmd builtin (needs the shell) and treats "/"
                # as a switch prefix, so the path must use backslashes
                subprocess.call("dir outputs\\words\\", shell=True)
            else:
                subprocess.call(["ls", "outputs/words/"])
            print("")
        print("Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n")
if __name__ == "__main__":
    if sys.argv[1:]:
        # CIntruderOCR takes one captcha plus an options object exposing
        # "verbose" and "setids"; standalone runs use quiet defaults and
        # the automatically chosen colour.
        class _StandaloneOptions(object):
            verbose = False
            setids = None
        ocr = CIntruderOCR(sys.argv[1], _StandaloneOptions())
        print ("Data correctly extracted!")
    else:
        print ("You must set a captcha for learn. Ex: inputs/test1.gif")
|