epsylon
/
cintruder


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156
							#!/usr/bin/python3
# -*- coding: iso-8859-15 -*-
"""
This file is part of the cintruder project, https://cintruder.03c8.net

Copyright (c) 2012/2020 psy <epsylon@riseup.net>

cintruder is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free
Software Foundation version 3 of the License.

cintruder is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along
with cintruder; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
"""
from PIL import Image
from operator import itemgetter
import os, hashlib, time, sys, subprocess, platform
import shutil

class CIntruderOCR(object):
    """
    Class to apply OCR techniques into captchas (general algorithm)
    """
    def __init__(self, captcha, options):
        # generate words structure (+ previews for gui)
        if not os.path.exists("outputs/words/"):
            os.mkdir("outputs/words/")
        else:
            shutil.rmtree("outputs/words/") 
            os.mkdir("outputs/words/")
        if not os.path.exists("core/images/previews/"):
            os.mkdir("core/images/previews/")
        else:
            shutil.rmtree("core/images/previews/")
            os.mkdir("core/images/previews/")
        if not os.path.exists("core/images/previews/ocr/"):
            os.mkdir("core/images/previews/ocr/")
        else:
            shutil.rmtree("core/images/previews/ocr/")
            os.mkdir("core/images/previews/ocr/")
        # initialize main CIntruder
        try:
            im = Image.open(captcha)
            im.save("core/images/previews/last-preview.gif")
            im2 = Image.new("P", im.size, 255)
            im = im.convert("P")
        except:
            print("[Error] Fail during OCR process... Is that captcha supported?\n")
            return
        colourid = []
        try: # extract colour histogram
            hist = im.histogram()
        except:
            print("[Error] Something wrong extracting histogram. Aborting...\n")
            return
        values = {}
        for i in range(256):
            values[i] = hist[i]
        if options.verbose:
            print("[Info] Extracting advanced OCR info...")
            print("\n=============================") 
            print("Image Histogram (order by >):")
            print("=============================")
        count = 0
        for j, k in sorted(list(values.items()), key=itemgetter(1), reverse=True)[:10]:
            colourid.append(j)  
            if options.verbose:
                count = count + 1
                if count == 1: # first is background
                    print("Colour ID: [ "+ str(j)+ " ] -> Total pixels: "+ str(k)+ " [Background]")
                else:
                    print("Colour ID: [ "+ str(j)+ " ] -> Total pixels: "+ str(k))
        if options.verbose:
            print("")
        temp = {}
        for x in range(im.size[1]):
            for y in range(im.size[0]):
                pix = im.getpixel((y, x))
                temp[pix] = pix
                if options.setids:
                    colourid = int(options.setids)
                    if pix == colourid:
                        im2.putpixel((y, x), 0)
                else:
                    if pix == colourid[1]: #id numbers of colours to get (*)
                        im2.putpixel((y, x), 0)
        im2.save("outputs/last-ocr_image-processed.gif")
        inletter = False
        foundletter = False
        start = 0
        end = 0
        letters = []
        for y in range(im2.size[0]): 
            for x in range(im2.size[1]): 
                pix = im2.getpixel((y, x))
                if pix != 255:
                    inletter = True
            if foundletter == False and inletter == True:
                foundletter = True
                start = y
            if foundletter == True and inletter == False:
                foundletter = False
                end = y
                letters.append((start, end))
            inletter = False
        count = 0
        for letter in letters:
            m = hashlib.md5()
            try:
                m.update(str(letter))
            except:
                m.update(str(letter).encode('utf-8'))
            im3 = im2.crop(( letter[0], 0, letter[1], im2.size[1] ))
            im3.save("outputs/words/%s.gif"%(m.hexdigest()))
            im3.save("core/images/previews/ocr/%s.gif"%(m.hexdigest()))
            count += 1
        print("[Info] Processing captcha/image with OCR algorithms. Please wait...\n")
        print("=================")
        print("Training Results:")
        print("=================")
        print("\n[Info] Number of 'symbols' found: "+ str(count)+"\n")
        if count == 0:
            print("\nOuch!. Looks like this captcha is resisting to our OCR methods... by the moment ;-)\n")
            print("Try this...\n") 
            print("    1) Check colour's ID values and quantity of pixels of each symbol using verbose") 
            print("    2) Set different ID values to your OCR configuration and try it again")
            print("    3) Try to apply some image filters (ex: B/W) manually with an editor (ex: GIMP) to your target")
            print("    4) Maybe there is a module that works correctly for this captcha...\n")
            print("------------\n")
        else:
            path, dirs, files = next(os.walk("outputs/words/"))
            file_count = str(len(files))
            print ("[Info] Generated [ "+ file_count+ " ] OCR images here: outputs/words/\n")
            if options.verbose:
                # checking for platform to list new words added to dictionary
                os_sys = platform.system()
                if os_sys == "Windows":
                    subprocess.call("dir outputs/words/", shell=True)
                else:
                    subprocess.call("ls outputs/words/", shell=True)
                print("")
            print("Now move each (human-recognized) OCR image to the correct folder on: dictionary/\n")

if __name__ == "__main__":
    if sys.argv[1:]:
        ocr = CIntruderOCR(sys.argv[1:])
        print ("Data correctly extracted!")
    else:
        print ("You must set a captcha for learn. Ex: inputs/test1.gif")