#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
"""
This file is part of the cintruder project, http://cintruder.03c8.net

Copyright (c) 2012/2016 psy <epsylon@riseup.net>

cintruder is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with cintruder; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
"""
from PIL import Image
from operator import itemgetter
import os, hashlib, time, sys, subprocess, platform
import shutil

class CIntruderOCR(object):
    """
    Apply a generic OCR pre-processing pass to a captcha image.

    Instantiating the class runs the whole pipeline: the output folders are
    reset, one palette colour is isolated into a two-tone working image, that
    image is split into vertical slices ("symbols") wherever a fully blank
    column separates them, and every slice is saved as its own GIF file.
    """
    def __init__(self, captcha, options):
        """
        Run the extraction on ``captcha``.

        captcha -- image source handed directly to ``Image.open``.
        options -- object exposing two attributes:
                   ``verbose``: when true, print the colour histogram and
                   list the generated files;
                   ``setids``: when true-ish, the palette index (anything
                   ``int()`` accepts) to extract instead of the second most
                   frequent colour.

        If the image cannot be opened/converted, or its histogram cannot be
        read, an error message is printed and the constructor returns early.
        """
        # generate words structure (+ previews for gui)
        for folder in ("outputs/words/",
                       "core/images/previews/",
                       "core/images/previews/ocr/"):
            self._reset_dir(folder)
        # initialize main CIntruder
        try:
            im = Image.open(captcha)
            im.save("core/images/previews/last-preview.gif")
            im2 = Image.new("P", im.size, 255)
            im = im.convert("P")
        except Exception:
            print("Error during OCR process... Is that captcha supported?\n")
            return
        try: # extract colour histogram
            hist = im.histogram()
        except Exception:
            print("\n[Error] Something wrong extracting histogram. Aborting...\n")
            return
        # ten most used palette indexes, most frequent first
        ranked = sorted(enumerate(hist[:256]), key=itemgetter(1), reverse=True)[:10]
        colourid = [colour for colour, _ in ranked]
        if options.verbose:
            print("\n[Info] Extracting advanced OCR info...")
            print("\n=============================")
            print("Image Histogram (order by >):")
            print("=============================")
            for rank, (colour, total) in enumerate(ranked):
                line = "Colour ID: [ %s ] -> Total pixels: %s" % (colour, total)
                if rank == 0: # first is background
                    line += " [Background]"
                print(line)
            print("")
        # colour to extract: user supplied id, otherwise the second most
        # frequent one (the most frequent is treated as background)
        if options.setids:
            target = int(options.setids)
        else:
            target = colourid[1]
        width, height = im.size
        for row in range(height):
            for col in range(width):
                if im.getpixel((col, row)) == target:
                    im2.putpixel((col, row), 0)
        im2.save("outputs/last-ocr_image-processed.gif")
        # a column has "ink" when at least one of its pixels was extracted
        ink_columns = [
            any(im2.getpixel((col, row)) != 255 for row in range(height))
            for col in range(width)
        ]
        letters = self._column_spans(ink_columns)
        count = 0
        for start, end in letters:
            m = hashlib.md5()
            # file name is a hash of the current time plus the symbol index;
            # encoded so the hash input is bytes on every Python version
            m.update(("%s%s" % (time.time(), count)).encode("utf-8"))
            name = m.hexdigest()
            im3 = im2.crop((start, 0, end, height))
            im3.save("outputs/words/%s.gif" % name)
            im3.save("core/images/previews/ocr/%s.gif" % name)
            count += 1
        print("[Info] Processing captcha/image with OCR algorithms. Please wait...\n")
        print("=================")
        print("Training Results:")
        print("=================")
        print("[Info] Number of 'symbols' found: [ %s ]" % count)
        if count == 0:
            print("\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n")
            print("Try this...\n")
            print("    1) Check colour's ID values and quantity of pixels of each by using verbose")
            print("    2) Set different ID values to your OCR configuration and try it again")
            print("    3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target")
            print("    4) Maybe there is a module that works correctly for this captcha...\n")
            print("------------\n")
        else:
            path, dirs, files = next(os.walk("outputs/words/"))
            file_count = str(len(files))
            print("[Info] Generated [ " + file_count + " ] OCR images here: outputs/words/\n")
            if options.verbose:
                # checking for platform to list new words added to dictionary
                os_sys = platform.system()
                if os_sys == "Windows":
                    # cmd's "dir" parses "/" as a switch prefix, so the path
                    # has to use backslashes there
                    subprocess.call("dir outputs\\words", shell=True)
                else:
                    subprocess.call("ls outputs/words/", shell=True)
                print("")
            print("Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n")

    @staticmethod
    def _reset_dir(path):
        """Remove ``path`` if it exists, then recreate it empty (parents included)."""
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path)

    @staticmethod
    def _column_spans(ink_columns):
        """
        Group consecutive true entries of ``ink_columns`` into spans.

        ink_columns -- sequence with one boolean per image column.
        Returns a list of ``(start, end)`` tuples, ``end`` being exclusive.
        A run that reaches the last column is closed at ``len(ink_columns)``
        so a symbol touching the right border is not lost.
        """
        spans = []
        start = None
        for col, has_ink in enumerate(ink_columns):
            if has_ink and start is None:
                start = col
            elif not has_ink and start is not None:
                spans.append((start, col))
                start = None
        if start is not None:
            spans.append((start, len(ink_columns)))
        return spans

# Standalone entry point: run the extraction on the image given as the first
# command-line argument.
if __name__ == "__main__":
    if sys.argv[1:]:
        class _StandaloneOptions(object):
            """Minimal stand-in for the options object read by CIntruderOCR."""
            verbose = False  # no histogram / file listing output
            setids = None    # let the class pick the colour to extract

        # the constructor needs the image path (not the argv list) and an
        # options object; it does all the work itself
        ocr = CIntruderOCR(sys.argv[1], _StandaloneOptions())
        print ("Data correctly extracted!")
    else:
        print ("You must set a captcha for learn. Ex: inputs/test1.gif")