- #!/usr/bin/env python
- # -*- coding: utf-8 -*-
- """
- Propagare - 2018 - by psy (epsylon@riseup.net)
- -------
- You should have received a copy of the GNU General Public License along
- with Propagare; if not, write to the Free Software Foundation, Inc., 51
- Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
- -------
- "A diferencia del odio, el amor se expande sin necesidad de propaganda."
- """
- import os, sys, urllib2, ssl, random, re, time, string, traceback
- import json, operator, io
- try:
- import html2text
- except ImportError:
- print "\n[Error] - No se encuentra la librería: html2text. \n\n Puedes instalarla ejecutando en la terminal:\n\n sudo apt-get install python-html2text\n\n Después, prueba a lanzar de nuevo la aplicación.\n"
- sys.exit(2)
- from core.options import PropagareOptions
- from core.update import Updater
- reload(sys)
- sys.setdefaultencoding('utf-8') # force utf-8 as the default codec when decoding bytes (Python 2 hack!)
- DEBUG = False
- class Propagare(object):
- def __init__(self):
- self.supported_media = ["elpais.com", "eldiario.es", "elmundo.es"] # media sources modules currently implemented
- self.check_verb_online = "https://www.esfacil.eu/es/verbos/conjugacion.html?infinitive=" # for check spanish verbs online
- self.sources = [] # used for news media sources
- self.agents_file = 'core/txt/user-agents.txt' # set source path to retrieve user-agents
- self.referer = 'http://127.0.0.1/' # set referer
- self.agents = []
- f = open(self.agents_file)
- agents = f.readlines()
- f.close()
- for agent in agents:
- self.agents.append(agent) # agents are random each request
- self.ctx = ssl.create_default_context() # creating context to bypass SSL cert validation (black magic!)
- self.ctx.check_hostname = False
- self.ctx.verify_mode = ssl.CERT_NONE
- self.jdata = [] # json stream container (a list)
- self.verbs = [] # a list to store semantics (verbs)
- self.total_verbs = 0
- self.total_num = 0
- def set_options(self, options):
- self.options = options
- def create_options(self, args=None):
- self.optionParser = PropagareOptions()
- self.options = self.optionParser.get_options(args)
- if not self.options:
- return False
- return self.options
- def remove_punctuation(self, news):
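- # Drops every character listed in 'p' below; illustrative example:
- # remove_punctuation("¡Hola, mundo!") -> "¡Hola mundo" ('¡' is not in 'p', so it survives)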
- p = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
- no_p = ""
- for c in news:
- if c not in p:
- no_p = no_p + c
- news = no_p
- return news
- def update_progress(self, t, p, l):
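- # Text progress bar; e.g. update_progress("ETA", 5, 10) renders
- # "ETA: [#####-----] 50.0%" ('p' doubles as progress count and '#' width).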
- b = p*100/l
- msg = "\r{0}: [{1}] {2}%".format(t, "#"*p + "-"*(l-p), round(b,2))
- if p >= 100: msg += " DONE\r\n"
- sys.stdout.write(msg)
- sys.stdout.flush()
- def banner(self):
- print '='*75
- print " ____ _ "
- print " | _ \ Si no 'contesta'...(es)__ _ __ _ _ __ __| | __ _ "
- print " | |_) | '__/ _ \| '_ \ / _` |/ _` |/ _` | '_ \ / _` |/ _` |"
- print " | __/| | | (_) | |_) | (_| | (_| | (_| | | | | (_| | (_| |"
- print " |_| |_| \___/| .__/ \__,_|\__, |\__,_|_| |_|\__,_|\__,_|"
- print " |_| |___/ (IA)v:0.1"
- print "\n"+'='*75
- print "\n"+self.optionParser.description
- print "\n"+'='*75
- @classmethod
- def try_running(cls, func, error, args=None):
- args = args or []
- try:
- return func(*args)
- except Exception:
- print error
- if DEBUG:
- traceback.print_exc()
- def generate_json(self, n, category, date, tag, ID):
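- # On-disk layout per source (matching the os.makedirs() calls in run()):
- #   elpais.com  -> data/<source>/<category>/<date>/<tag>/<ID>/<ID>.json
- #   eldiario.es -> data/<source>/<category>/<ID>/<ID>.json
- #   elmundo.es  -> data/<source>/<category>/<date>[/<tag>]/<ID>.json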
- if "elpais.com" in n:
- self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".json", "w")
- if "eldiario.es" in n:
- self.json_report = open('data/'+n+"/"+category+"/"+ID+"/"+ID+".json", "w")
- if "elmundo.es" in n:
- if tag is None:
- self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+ID+".json", "w")
- else:
- self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".json", "w")
- def format_content(self, data):
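- # Flattens article HTML to plain text; illustrative example:
- # "<p>Hola <b>mundo</b></p>" -> "Hola mundo" (links, images and emphasis dropped)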
- html_parser = html2text.HTML2Text()
- html_parser.ignore_links = True
- html_parser.ignore_images = True
- html_parser.ignore_emphasis = True
- html_parser.bypass_tables = True
- html_parser.unicode_snob = True
- html_parser.skip_internal_links = True
- parsed = html_parser.handle(data)
- parsed = parsed.replace("\n", " ") # clean \n
- parsed = parsed.replace('\"', " ") # clean \"
- parsed = parsed.replace("#","") # clean #
- return parsed
- def count_all_stored(self): # all sources
- art_stored = 0
- for root, dirs, files in os.walk('data/'):
- art_stored += len(files)
- return (art_stored-1)/2 # ((txt+json) - last_stats.json) / 2
- def count_art_stored(self, n): # by source
- art_stored = 0
- for root, dirs, files in os.walk('data/' + n):
- art_stored += len(files)
- return art_stored/2 # (txt+json)/2
- def count_sources_list(self):
- media_stored = 0
- for root, dirs, files in os.walk("data/"):
- for d in dirs:
- if d in self.supported_media:
- media_stored = media_stored + 1
- return media_stored
- def check_art_exists(self, a, n):
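- # Returns False as soon as a stored .json filename matches the article URL;
- # returns True otherwise (i.e. the article has not been archived yet).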
- sep = ".json"
- for root, dirs, files in os.walk('data/' + n):
- for f in files:
- if f.endswith(".json"):
- f = str(f.split(sep, 1)[0])
- if str(f) in str(a): # article already stored
- return False
- return True # no match found: the article is new
- def create_sources_list(self):
- for root, dirs, files in os.walk("sources/", topdown=False):
- for name in files:
- if name not in self.sources and name in self.supported_media:
- self.sources.append(name) # add name to sources list
- def check_art_repetitions(self, n, art_url_found):
- sep = "/"
- sep2 = ".html"
- sep3 = "."
- sep4 = "_"
- filenames = []
- for root, dirs, files in os.walk('data/' + n):
- for f in files:
- filename = os.path.basename(f)
- filename = str(filename.split(sep3, 1)[0])
- if not filename in filenames:
- filenames.append(filename)
- if filenames:
- for f in filenames:
- for a in list(art_url_found): # iterate over a copy; the original list is mutated below
- if "eldiario.es" in n:
- if str(f) in str(a):
- art_url_found.remove(a)
- if "elpais.com" in n:
- f = str(f.split(sep3, 1)[0])
- ID = str(a.split(sep, 5)[5])
- ID = str(ID.split(sep2, 1)[0])
- if sep in ID:
- ID = str(ID.split(sep, 2)[2])
- if str(ID) in str(f): # art stored, discard it
- art_url_found.remove(a)
- return art_url_found
- def is_a_verb(self, w):
- if w.endswith("ar") or w.endswith("er") or w.endswith("ir"): # (spanish: infinitive verb endings)
- self.total_verbs = self.total_verbs + 1
- self.verbs.append(w) # add verb to list
- def check_verb(self, verb):
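- # Asks an external conjugation service whether 'verb' really is a Spanish
- # verb; e.g. check_verb("amar") would be expected to return True and
- # check_verb("xyzar") False, although results depend on the remote site.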
- check_verb_online = str(self.check_verb_online) + verb # url + verb
- self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
- headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
- try:
- reply = urllib2.urlopen(check_verb_online, context=self.ctx).read()
- except:
- print '\n[Error] - Imposible conectar con: ' + check_verb_online + '\n'
- return False
- if "¡Verbo no válido!" in reply or "¡Verbo no encontrado!" in reply: # working at: 30/04/2018
- return False
- else:
- return True
- def generate_data_stream(self):
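- # Loads every stored .json file into self.jdata; each record is a flat
- # object shaped like this (values illustrative):
- # {
- #   "Autor(a)": "Nombre Apellido",
- #   "Entrada": "...",
- #   "Fecha de publicación": "2018/05/10",
- #   "Fuente": "https://elpais.com/...",
- #   "Noticia": "...",
- #   "Titular": "..."
- # }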
- if self.options.ssource:
- if self.options.ssource in self.supported_media: # source is supported
- source = 'data/' + self.options.ssource
- else:
- print "-"*25
- print('\n[Error] La fuente indicada de estadísticas no está soportada! \n')
- print "-"*25
- print("\n[Info] Listado de fuentes soportadas :\n")
- n = 0
- for m in self.supported_media:
- n = n + 1
- print " + ["+str(n)+"]:", m
- print "" # zen out
- sys.exit(2)
- else:
- source = 'data/'
- if self.options.tsource:
- if self.options.tsource in self.supported_media: # source is supported
- source = 'data/' + self.options.tsource
- else:
- print('\n[Error] La fuente indicada de búsqueda no está soportada! \n')
- print "-"*25
- print("\n[Info] Listado de fuentes soportadas :\n")
- n = 0
- for m in self.supported_media:
- n = n + 1
- print " + ["+str(n)+"]:", m
- print "" # zen out
- sys.exit(2)
- else:
- if not self.options.ssource:
- source = 'data/'
- for root, dirs, files in os.walk(source): # generate stream for analysis
- for fl in files:
- if fl.endswith(".json"): # extract content from json archives
- p=os.path.join(root,fl)
- kf = io.open(os.path.abspath(p), encoding='utf-8')
- try:
- data = str(kf.read().encode('utf-8'))
- except:
- data = kf.read()
- try:
- self.jdata.append(json.loads(data))
- except:
- pass
- kf.close()
- if not self.jdata:
- print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
- print "[Info] Tienes el almacén vacío. Saliendo...\n"
- sys.exit(2) # return
- self.body_news_stream = []
- self.dates_news_stream = []
- self.author_news_stream = []
- self.entry_news_stream = []
- self.title_news_stream = []
- #self.url_news_stream = []
- for record in self.jdata:
- for key, value in record.iteritems(): # unpack a 'huge-stream' dict, stored on a list (a tuple) using iteritems() (black magic!)
- if key == "Noticia": # parse only for body content
- self.body_news_stream.append(value)
- if key == "Fecha de publicación": # parse only for dates
- self.dates_news_stream.append(value)
- if key == "Autor(a)": # parse only for authors
- self.author_news_stream.append(value)
- if self.options.term: # extract more keys when searching a term
- if key == "Entrada": # parse only for entry content
- self.entry_news_stream.append(value)
- if key == "Titular": # parse only for title
- self.title_news_stream.append(value)
- #if key == "Fuente": # parse only for url source
- # self.url_news_stream.append(value)
- def stats(self):
- print "\n[Info] Recopilando estadísticas del almacén...\n"
- all_art_stored = self.count_all_stored()
- if all_art_stored <= 0:
- print '-'*25
- print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
- print "[Info] Tienes el almacén vacío. Saliendo...\n"
- return
- else:
- print "-"*25
- json_stats = open('data/last_stats.json', "w") # generate json with last stats
- json_stats_data = {}
- self.generate_data_stream() # generate a 'buffer' stream with records (using json files)
- self.create_sources_list()
- media_sources = self.count_sources_list()
- print "\n [+] Total medios:", str(media_sources)
- print " [+] Total noticias:", str(all_art_stored) + "\n"
- json_stats_data.update({"Total medios": str(media_sources)})
- json_stats_data.update({"Total noticias": str(all_art_stored)})
- news_letters = 0
- symbols = []
- letters_dict = {}
- words_dict = {}
- words_3_dict = {}
- words_4_dict = {}
- words_5_dict = {}
- words_6_dict = {}
- words_7_dict = {}
- words_8_dict = {}
- verbs_dict = {}
- authors_dict = {}
- letters_counter = 0
- words_counter = 0
- verbs_counter = 0
- authors_counter = 0
- for news in self.body_news_stream:
- news_parsed = self.remove_punctuation(str(news)) # remove punctuation signs / encode from unicode to str
- news_parsed_noblank = news_parsed.replace(" ","") # data as a stream without blank spaces (assign the result; str.replace() does not mutate)
- news_parsed_noblank = news_parsed_noblank.lower() # change stream to lowercase
- news_letters = news_letters + len(news_parsed_noblank)
- news_split = news_parsed.split()
- nums = "0123456789" # 0-9
- symbols = map(chr, range(97, 123)) # a-z (the stream is lowercased, so uppercase is already folded in)
- for char in nums:
- symbols.append(char)
- for l in symbols:
- if l in news_parsed_noblank: # only count those letters that exists
- letters_counter = int(letters_counter + news_parsed_noblank.count(l))
- if l in letters_dict: # 'in' on the dict checks keys directly
- g = int(letters_dict[l]) # extract previous value
- else:
- g = 0
- lg = letters_counter + g # sum new letters counted to previous ones
- letters_dict.update({l:lg}) # update dict with new value
- letters_counter = 0 # flush counter
- for w in news_split:
- w = w.lower() # change word to lowercase
- if w in words_dict:
- words_counter = words_counter + 1
- g = int(words_dict[w])
- else:
- g = 1
- words_counter = words_counter + g
- words_dict.update({w:words_counter})
- words_counter = 0
- if self.options.checkverbs:
- self.is_a_verb(w) # check for verbs (adding to a list) using semantic rules
- for key, value in words_dict.iteritems():
- if len(key) == 3:
- words_3_dict[key] = value
- if len(key) == 4:
- words_4_dict[key] = value
- if len(key) == 5:
- words_5_dict[key] = value
- if len(key) == 6:
- words_6_dict[key] = value
- if len(key) == 7:
- words_7_dict[key] = value
- if len(key) > 7:
- words_8_dict[key] = value
- if self.options.ssource:
- print '-'*25
- print "\n[Info] Mostrando estadísticas para:", str(self.options.ssource) + "\n"
- print " [+] Noticia más antigua:",str(min(self.dates_news_stream))
- print " [+] Noticia más reciente:",str(max(self.dates_news_stream)) + "\n"
- json_stats_data.update({"Noticia más antigua": str(min(self.dates_news_stream))})
- json_stats_data.update({"Noticia más reciente": str(max(self.dates_news_stream))})
- print " [+] Total símbolos (a-Z/0-9):", str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")"
- json_stats_data.update({"Total símbolos (a-Z/0-9)": str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")" })
- if max(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " [+] Símbolo más utilizado: '"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"
- json_stats_data.update({"Símbolo más utilizado": "'"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"})
- if min(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_min = "vez"
- else:
- lw_min = "veces"
- print " [+] Símbolo menos repetido: '"+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")" + "\n"
- json_stats_data.update({"Símbolo menos repetido": "'"+str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")"})
- print " [+] Total palabras:", str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")"
- json_stats_data.update({"Total palabras": str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")" })
- if max(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " [+] Palabra más repetida: '"+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida": "'"+str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 3 letras: '"+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida (3 letras)": "'"+str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 4 letras: '"+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida (4 letras)": "'"+str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 5 letras: '"+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida (5 letras)": "'"+str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 6 letras: '"+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida (6 letras)": "'"+str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 7 letras: '"+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Palabra más repetida (7 letras)": "'"+str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " - 8+ letras: '"+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")" + "\n"
- json_stats_data.update({"Palabra más repetida (8 letras)": "'"+str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if min(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_min = "vez"
- else:
- lw_min = "veces"
- print " [+] Palabra menos repetida: '"+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"
- json_stats_data.update({"Palabra menos repetida": "'"+str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"})
- longest_word = max(words_8_dict.iteritems(), key=lambda kv: len(kv[0])) # key=len alone would measure the (key, value) tuple, whose length is always 2
- if longest_word[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " [+] Palabra más larga y que más se usa: '"+ str(longest_word[0]).encode('utf-8').strip() + "' ("+ str(longest_word[1]) + " " + lw_max + ")" + "\n"
- json_stats_data.update({"Palabra más larga y que más se usa": "'"+str(longest_word[0]) + "' ("+ str(longest_word[1]) + " " + lw_max + ")"})
- if self.options.checkverbs:
- verb_flag = False
- num = 0
- print "[Info] Analizando (requiere tiempo!) en busca de: 'verbos infinitivos'...\n"
- for verb in self.verbs:
- num = num + 1
- verb_flag = self.check_verb(verb) # re-check previous list of verbs (online!)
- if verb_flag is True: # is a verb
- if verb in verbs_dict:
- verbs_counter = verbs_counter + 1
- g = int(verbs_dict[verb])
- else:
- g = 1
- verbs_counter = verbs_counter + g
- verbs_dict.update({verb:verbs_counter})
- verbs_counter = 0
- if not verbs_dict:
- num_verbs = 0
- dif_verbs = 0
- else:
- num_verbs = str(sum(verbs_dict.values()))
- dif_verbs = str(len(verbs_dict))
- if verbs_dict:
- if self.options.checkverbs:
- print " [+] Total verbos (infinitivos):", str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"
- json_stats_data.update({"Total verbos (infinitivos)": str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"})
- if max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_max = "vez"
- else:
- lw_max = "veces"
- print " [+] Verbo (infinitivo) más utilizado: '"+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
- json_stats_data.update({"Verbo (infinitivo) más utilizado": "'"+str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
- if min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
- lw_min = "vez"
- else:
- lw_min = "veces"
- print " [+] Verbo (infinitivo) menos repetido: '"+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")" + "\n"
- json_stats_data.update({"Verbo (infinitivo) menos repetido": "'"+str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"})
- for a in self.author_news_stream:
- if a in authors_dict:
- authors_counter = authors_counter + 1
- g = int(authors_dict[a])
- else:
- g = 1
- authors_counter = authors_counter + g
- authors_dict.update({a:authors_counter})
- authors_counter = 0
- print " [+] Total periodistas:", str(len(authors_dict.items()))
- json_stats_data.update({"Total periodistas": str(len(authors_dict.items()))})
- print " [+] Noticias por periodista:"
- sep = "/"
- num = 0
- for a, c in sorted(authors_dict.items(), key=lambda x: x[1], reverse=True):
- num = num + 1
- if c < 2:
- lg = "noticia"
- else:
- lg = "noticias"
- a = unicode(a)
- a = a.encode('utf-8') # back to a utf-8 byte string (re-encoding through ISO-8859-1 would crash on non-ASCII)
- print " - '"+str(a).title() + "' ("+ str(c) + " " + lg + ")"
- json_stats_data["Autor(a)-["+str(num)+"]"] = "'"+str(a).title() + "' ("+ str(c) + " " + lg + ")"
- json_stats.write(json.dumps(json_stats_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # write stats to json file
- json_stats.close() # close .json
- def search(self):
- print "\n[Info] Generando índice de búsqueda...\n"
- all_art_stored = self.count_all_stored()
- if all_art_stored == 0:
- print '-'*25
- print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
- print "[Info] Tienes el almacén vacío. Saliendo...\n"
- return
- else:
- print "-"*25
- self.generate_data_stream() # generate a 'buffer' stream with all records (using json files)
- term_reply = str(raw_input("\n $ Introduce una palabra (ej: corrupción): "))
- term_reply = " " + term_reply + " " # parse term_reply to use it as a single word
- counter_term = 0
- counter_term_body = 0
- counter_term_entry = 0
- counter_term_title = 0
- titles_stream = []
- for n in self.body_news_stream:
- if term_reply in n:
- counter_term_body = counter_term_body + 1
- for e in self.entry_news_stream:
- if term_reply in e:
- counter_term_entry = counter_term_entry + 1
- for t in self.title_news_stream:
- if term_reply in t:
- counter_term_title = counter_term_title + 1
- titles_stream.append(t)
- counter_term = counter_term_body + counter_term_entry + counter_term_title
- ct = "vez" if counter_term == 1 else "veces"
- cb = "artículo" if counter_term_body == 1 else "artículos"
- ce = "entrada" if counter_term_entry == 1 else "entradas"
- cl = "titular" if counter_term_title == 1 else "titulares"
- if self.options.tsource:
- print "\n" + '-'*25
- print "\n[Info] Mostrando resultados en:", str(self.options.tsource)
- print "\n [+] Aparece en: ("+str(counter_term_body)+" "+str(cb)+"), ("+str(counter_term_title)+" "+str(cl)+") y ("+str(counter_term_entry)+" "+str(ce)+")"
- print " [+] Sale un total de: ("+str(counter_term)+" "+str(ct)+")" + "\n"
- def run(self, opts=None):
- options = self.create_options(opts)
- self.set_options(options)
- options = self.options
- if options.update: # update tool
- self.banner()
- try:
- print("\n [Info] Tratando de actualizar a la última versión estable:\n")
- Updater()
- except:
- print("\n [Error] Algo ha ido mal!. Para hacer funcionar ésta opción, deberías clonar Propagare lanzando:\n")
- print(" $ git clone https://github.com/epsylon/propagare\n")
- if options.view_media: # show supported sources
- self.banner()
- print("\n[Info] Listado de fuentes soportadas:\n")
- n = 0
- for m in self.supported_media:
- n = n + 1
- print " + ["+str(n)+"]:", m
- print "" # zen out
- if options.stats: # show archive stats
- self.banner()
- self.stats()
- if options.term: # start a 'semantic' search for a term
- self.banner()
- self.search()
- if options.news: # extract news (general run)
- try:
- self.banner()
- print "\n[Info] Buscando las fuentes de datos...\n"
- print "[Info] Examinando el contenido en línea...\n"
- sep='=='
- sep2='?'
- art_url=''
- art_url_list=[]
- art_title=''
- art_author=''
- art_location=''
- num=0
- flag=True
- if options.esource: # user has set a specific source
- if options.esource in self.supported_media: # source is supported
- self.supported_media = options.esource
- else:
- print "-"*25
- print('\n[Error] La fuente que has indicado no está soportada! \n')
- print "-"*25
- print("\n[Info] Listado de fuentes soportadas :\n")
- n = 0
- for m in self.supported_media:
- n = n + 1
- print " + ["+str(n)+"]:", m
- print "" # zen out
- return
- self.create_sources_list()
- for n in self.sources: # n = news media source
- n_url = n # default; prefixed with a scheme below when missing
- if not n.startswith("http"):
- if n == "elmundo.es": # this media only supports access using www
- n_url = "https://www." + n
- else:
- n_url = "https://" + n # SSL only
- print "- Visitando:", n_url
- self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
- headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
- try:
- reply = urllib2.urlopen(n_url, context=self.ctx).read()
- except:
- print '\n[Error] - Imposible conectar con: ' + n
- continue # skip this source; 'reply' would otherwise be stale or undefined
- f = open('sources/'+ n)
- regex = f.readlines()
- f.close()
- #print reply # handy output when developing new source modules
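- # Each sources/<media> file is expected to hold one 'key==regex' pair per
- # line; the keys parsed below are: art_url, art_url2, art_author,
- # art_author2, art_time, art_title, art_description, art_body and art_body2.
- # Illustrative (hypothetical) line: art_title==<h1 itemprop="headline">(.*?)</h1>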
- for r in regex: # extract specific keywords from news: time, author, url (+variations), title, description, body
- if 'art_url==' in r or 'art_url2==' in r: # ('x' or 'y') in r would only ever test 'x'
- art_url = r
- regex_art_url = str(art_url.split(sep, 1)[1]) # regex magics (art_url)
- pattern_art_url = re.compile(regex_art_url)
- if 'art_author==' in r:
- art_author = r
- regex_art_author = str(art_author.split(sep, 1)[1]) # regex magics (art_author)
- pattern_art_author = re.compile(regex_art_author)
- if 'art_time==' in r:
- art_time = r
- regex_art_time = str(art_time.split(sep, 1)[1]) # regex magics (art_time)
- pattern_art_time = re.compile(regex_art_time)
- if 'art_title==' in r:
- art_title = r
- regex_art_title = str(art_title.split(sep, 1)[1]) # regex magics (art_title)
- pattern_art_title = re.compile(regex_art_title)
- if 'art_description==' in r:
- art_description = r
- regex_art_description = str(art_description.split(sep, 1)[1]) # regex magics (art_description)
- pattern_art_description = re.compile(regex_art_description)
- if 'art_body==' in r:
- art_body = r
- regex_art_body = str(art_body.split(sep, 1)[1]) # regex magics (art_body)
- pattern_art_body = re.compile(regex_art_body, re.MULTILINE)
- try:
- art_url_found = re.findall(pattern_art_url, reply) # found art_url patterns on main page
- art_url_parsed = self.check_art_repetitions(n, art_url_found) # discard results previously stored
- except:
- art_url_parsed = None
- pass
- art_stored = self.count_art_stored(n)
- if not art_url_parsed and not art_stored:
- print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de ésta fuente): 0\n"
- return
- if not art_url_parsed and art_stored > 0: # not any new article found + some articles stored
- pass
- elif len(art_url_parsed) == 0 and art_stored == 0: # no new articles found and none stored yet
- print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de esta fuente): " + str(art_stored) + "\n"
- return
- else: # new article found
- print "" # zen out
- for a in art_url_parsed:
- if "elmundo.es" in n: # re-parsing website: elmundo.es [10/05/2018]
- if '"' in a:
- a = str(a.split('"', 1)[0])
- if '#' in a:
- a = str(a.split('#', 1)[0])
- if not "http" in a or not "www.elmundo.es" in a or "vivienda" in a or "horoscopo" in a or "menu" in a or "?" in a or "indices" in a or "programacion-tv" in a:
- a = None
- if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
- if '" title="' in a:
- a = str(a.split('"', 1)[0])
- if '">' in a:
- a = str(a.split('"', 1)[0])
- if "rastreador" in a or "http" in a or "autores" in a or "www.eldiario.es" in a or "/carnecruda" in a or "/contenido_patrocinado" in a:
- a = None
- if "elpais.com" in n: # re-parsing website: elpais.com [24/04/2018]
- if "?" in a:
- a = str(a.split(sep2, 1)[0])
- if "posicionador" in a:
- a = a.replace('" class="posicionador',"")
- if "elpais.com" in a:
- a = "https:/" + a
- else:
- a = n_url + "/" + a
- if a is not None:
- if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
- a = "https://eldiario.es" + a
- if a not in art_url_list:
- check_pass = self.check_art_exists(a, n) # check if art found is previously stored for this media
- if check_pass is True or check_pass is None:
- art_url_list.append(a) # crawlered pages from main website
- print " + [IA]:", a
- if not art_url_list:
- pass
- else:
- print "\n[Info] Nuevos artículos encontrados: " + str(len(art_url_list)) + " | Total artículos almacenados (de ésta fuente): " + str(art_stored) + "\n"
- print "- Extrayendo:", n_url
- if not os.path.exists('data/'):
- os.makedirs('data/')
- if not os.path.exists('data/' + n):
- os.makedirs('data/' + n)
- for a in art_url_list:
- num=num+1 # art counter
- json_data = {} # json dict stream buffer
- if '"' in a: # re-parse url searching for " after it
- sep = '"'
- a = str(a.split(sep, 1)[0])
- print "\n + ["+str(num)+"/"+str(len(art_url_list))+"] Visitando:", a
- self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
- headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set user-agent and referer
- try:
- reply_art = urllib2.urlopen(a, context=self.ctx).read()
- except:
- print '\n[Error] - Imposible conectar con: ' + a
- return
- art_url_author_found = re.findall(pattern_art_author, reply_art) # found art_author pattern on page
- if not art_url_author_found:
- for r in regex: # extract another combination
- if 'art_author2==' in r:
- art_author = r
- try:
- regex_art_author = str(art_author.split(sep, 1)[1]) # re-regex magics (art_author)
- pattern_art_author = re.compile(regex_art_author)
- except:
- break
- art_url_author_found = re.findall(pattern_art_author, reply_art) # try another art_author pattern on page
- if not art_url_author_found: # not any author found using regex (use default for each media)
- if "elmundo.es" in n: # default author for elmundo.es when not signed
- art_url_author_found.append("Ediciones El Mundo")
- if "elpais.com" in n: # default author for elpais.com when not signed
- art_url_author_found.append("Ediciones El País")
- if "eldiario.es" in n: # default author for elpais.com when not signed
- art_url_author_found.append("Ediciones El Diario")
- else:
- if "elpais.com" in n: # based on specific reg exp.
- sep = '"'
- for author in art_url_author_found:
- art_url_author_found.remove(author)
- author = str(author.split(sep, 1)[0])
- art_url_author_found.append(author)
- art_url_time_found = re.findall(pattern_art_time, reply_art) # found art_time pattern on page
- art_url_title_found = re.findall(pattern_art_title, reply_art) # found art_title pattern on page
- art_url_description_found = re.findall(pattern_art_description, reply_art) # found art_description pattern on page
- art_url_body_found = re.findall(pattern_art_body, reply_art) # found art_body pattern on page (MULTILINE)
- if not art_url_body_found: # not any body found
- if "eldiario.es" in n:
- for r in regex: # extract another combination
- if 'art_body2==' in r:
- art_body = r
- regex_art_body = str(art_body.split(sep, 1)[1])
- pattern_art_body = re.compile(regex_art_body)
- art_url_body_found = re.findall(pattern_art_body, reply_art)
- time.sleep(0.1) # tic, tac!!!
- self.update_progress("\n - ETA", num, len(art_url_list))
- print "" # zen out
- if "elmundo.es" in a: # [10/05/2018] # schema: https://www.elmundo.es/category/{tag}/date/ID
- a_path = a.replace("http://www.elmundo.es/","") # remove pre-url (note: http)
- a_path = a_path.replace(".html","") # remove post-url
- if "elpais.com" in a: # [24/04/2018] # schema: https://elpais.com/category/date/tag/ID
- a_path = a.replace("https://elpais.com/","") # remove pre-url
- a_path = a_path.split(".html") # remove post-url
- if "elpais.com" in a_path: # re-parsing url [24/04/2018] # schema: https://loc.elpais.com/...
- a_path = a_path.split("elpais.com/")
- a_path = a_path[1]
- if "eldiario.es" in a: # [08/05/2018] # schema: https://eldiario.es/category/{tag}/ID
- a_path = a.replace("https://eldiario.es/","") # remove pre-url
- a_path = a_path.replace(".html","") # remove post-url
- if "/" in a_path: # / mostly used like url-category sep keyword
- a_path = a_path.split("/")
- if "elmundo.es" in a:
- try: # try with /tag/
- category = a_path[0]
- date_year = a_path[2]
- date_month = a_path[3]
- date_day = a_path[4]
- date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
- tag = a_path[1]
- ID = a_path[5]
- if not os.path.exists('data/'+n+"/"+category):
- os.makedirs('data/'+n+"/"+category)
- if not os.path.exists('data/'+n+"/"+category+"/"+date):
- os.makedirs('data/'+n+"/"+category+"/"+date)
- if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag): # create new record
- os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
- path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".txt" # set path to file
- self.generate_json(n, category, date, tag, ID) # generate .json
- except:
- try:
- category = a_path[0]
- date_year = a_path[1]
- date_month = a_path[2]
- date_day = a_path[3]
- date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
- ID = a_path[4]
- if not os.path.exists('data/'+n+"/"+category):
- os.makedirs('data/'+n+"/"+category)
- if not os.path.exists('data/'+n+"/"+category+"/"+date):
- os.makedirs('data/'+n+"/"+category+"/"+date)
- path = 'data/'+n+"/"+category+"/"+date+"/"+ID+".txt" # set path to file
- self.generate_json(n, category, date, None, ID) # generate .json
- except:
- pass
- if "eldiario.es" in a:
- category = a_path[0]
- ID = a_path[1]
- if not os.path.exists('data/'+n+"/"+category):
- os.makedirs('data/'+n+"/"+category)
- if not os.path.exists('data/'+n+"/"+category+"/"+ID):
- os.makedirs('data/'+n+"/"+category+"/"+ID)
- path = 'data/'+n+"/"+category+"/"+ID+"/"+ID+".txt" # set path to file
- self.generate_json(n, category, None, None, ID) # generate .json
- if "elpais.com" in a:
- category = a_path[0]
- date_year = a_path[1]
- date_month = a_path[2]
- date_day = a_path[3]
- date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
- tag = a_path[4]
- ID = a_path[5]
- if not os.path.exists('data/'+n+"/"+category):
- os.makedirs('data/'+n+"/"+category)
- if not os.path.exists('data/'+n+"/"+category+"/"+date):
- os.makedirs('data/'+n+"/"+category+"/"+date)
- if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag):
- os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
- if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID): # create new record
- os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID)
- path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".txt" # set path to file
- self.generate_json(n, category, date, tag, ID) # generate .json
- try:
- fs = open(path, "w") # generate .txt
- except:
- print "\n[Error] No se pueden extraer las noticias de manera correcta. Saliendo...\n"
- return
- fs.write("Fuente: " + str(a).encode('utf-8') + "\n") # write source url
- json_data.update({"Fuente": str(a)})
- for t in art_url_time_found:
- fs.write("Fecha de publicación: " + str(t).encode('utf-8') + "\n") # write time
- json_data.update({"Fecha de publicación": str(t)})
- for author in art_url_author_found:
- if "\t" in author:
- author = author.split('\t', 1)[1] # re-parse for \t
- author = str(author.decode('ISO-8859-1').strip())
- fs.write("Autor(a): " + str(author).encode('utf-8') + "\n") # write author
- json_data.update({"Autor(a)": str(author)})
- for title in art_url_title_found:
- title = str(title.decode('ISO-8859-1').strip())
- parsed = self.format_content(str(title))
- fs.write("Titular: " + str(parsed).encode('utf-8') + "\n") # write title
- json_data.update({"Titular": str(parsed)})
- for description in art_url_description_found:
- description = str(description.decode('ISO-8859-1').strip())
- parsed = self.format_content(str(description))
- fs.write("Entrada: " + str(parsed).encode('utf-8') + "\n") # write description
- json_data.update({"Entrada": str(parsed)})
- body_complete = ""
- for body in art_url_body_found:
- body = str(body.decode('ISO-8859-1').strip())
- if "elmundo.es" in a:
- body = body.split("TE PUEDE INTERESAR",1)[0]
- if "###" in body:
- body = body.replace("###","")
- if "elpais.com" in a:
- body = body.replace("<span>Explora nuestras historias</span> por temas","")
- body = body.replace("Recibe nuestra newsletter", "")
- body_complete += body + "\n\n"
- if "elmundo.es" in a:
- break
- parsed = self.format_content(body_complete)
- fs.write("\n" + str(parsed).encode('utf-8')) # write (plain text) body without keyword
- json_data.update({"Noticia": str(parsed)})
- self.json_report.write(json.dumps(json_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # json dump
- fs.close() # close .txt
- self.json_report.close() # close .json
- self.total_num = self.total_num + 1
- num = 0 # flush art found counter
- art_url_list = [] # flush art list
- all_art_stored = self.count_all_stored()
- if self.total_num:
- print "" # zen out
- print "\n[Info] Nuevos artículos descargados: " + str(self.total_num) + " | Total artículos almacenados (de todas las fuentes): " + str(all_art_stored) + "\n"
- if all_art_stored > 0:
- if not self.options.forceno:
- print "-"*25
- stats_reply = raw_input("¿Quieres ver las estadísticas comparadas (S/n)?\n")
- else:
- stats_reply = "N"
- if stats_reply == "s" or stats_reply == "S":
- self.stats()
- else:
- print "\n[Info] Saliendo...\n"
- return
- if not self.options.forceno:
- print "-"*25
- search_reply = raw_input("¿Quieres buscar información semántica (S/n)?\n")
- else:
- search_reply = "N"
- if search_reply == "s" or search_reply == "S":
- self.search()
- else:
- print "\n[Info] Saliendo...\n"
- except (KeyboardInterrupt, SystemExit):
- sys.exit()
- if __name__ == "__main__":
- app = Propagare()
- app.run()
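- # Illustrative invocation (exact flag names are defined in core/options.py;
- # the tool's own messages reference -e for extraction):
- #   $ python propagare.py -e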