
moved from https://github.com/epsylon/propagare

psy 5 years ago
parent commit 6ab9cbab8f

+ 69 - 2   README.md

@@ -1,3 +1,70 @@
-# propagare
+  ![Propagare](https://03c8.net/images/propagare2.png "Propagare")
 
-Propagar(es): extracción, organización y análisis semántico de noticias.
+----------
+
+ + Web:  https://propaganda.is
+
+ + Code:  https://github.com/epsylon/propagare
+
+----------
+
+  Propagare - a tool for the extraction, organization and semantic analysis of news.
+
+----------
+
+#### Installing:
+
+  Propagare is multi-platform. It requires Python (>2.7.9) and the following libraries:
+
+       python-html2text - Python module for converting HTML to Markdown text
+
+  On Debian-based systems (e.g. Ubuntu), you can install it by running:
+
+       sudo apt-get install python-html2text
+
+  Or you can use the automatic installation script (bundled with the tool) by running:
+
+       python setup.py install
+
+  Or even using 'pip':
+
+       pip install html2text
+
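+  As a quick optional sanity check (a sketch, not part of the documented install steps), you can verify that the dependency imports correctly from a terminal:
+
+       python -c "import html2text"
+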
+####  Library sources:
+
+   * Python: https://www.python.org/downloads/
+   * Pypi-html2text: https://pypi.org/project/html2text/
+
+----------
+
+####  License:
+
+  Propagare is distributed under the GPLv3 license. You can review it at: [LICENSE](./docs/LICENSE).
+
+----------
+
+####  Options
+
+       --version      show the program's version
+       -h, --help     show this help message
+
+       --update       update the tool to its latest version
+
+       --list         list the supported news sources
+
+       -e             extract news from all sources
+       --es=ESOURCE   specify a source to extract news from
+       --force-no     automatically answer "no" to the interactive prompts (useful for unattended data harvesting / Big Data)
+
+       -s             show some interesting statistics
+       --ss=SSOURCE   specify a source to extract statistics from
+       --check-verbs  check whether a word is a verb (experimental)
+
+       -t             search for a term in the data store
+       --ts=TSOURCE   search for the term in a specific source
+
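+  As an illustrative sketch of how the options above can be combined (assuming the launcher script is named 'propagare.py', as declared in core/options.py):
+
+       python propagare.py --list
+       python propagare.py -e --es=elpais.com
+       python propagare.py -s --check-verbs
+       python propagare.py -t --ts=eldiario.es
+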
+----------
+
+  ![Propagare](https://03c8.net/images/propagare.png "Propagare")
+
+  ![Propagare](https://03c8.net/images/propagare3.png "Propagare")

+ 14 - 0   propagare/core/__init__.py

@@ -0,0 +1,14 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+-------
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+-------
+
+"A diferencia del odio, el amor se expande sin necesidad de propaganda..."
+
+"""

+ 928 - 0   propagare/core/main.py

@@ -0,0 +1,928 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+-------
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+-------
+
+"A diferencia del odio, el amor se expande sin necesidad de propaganda."
+
+"""
+import os, sys, urllib2, ssl, random, re, time, string
+import json, operator, io, traceback
+try:
+    import html2text
+except ImportError:
+    print "\n[Error] - No se encuentra la librería: html2text. \n\n Puedes instalarla ejecutando en la terminal:\n\n   sudo apt-get install python-html2text\n\n Después, prueba a lanzar de nuevo la aplicación.\n"
+    sys.exit(2)
+from core.options import PropagareOptions
+from core.update import Updater
+reload(sys)
+sys.setdefaultencoding('utf-8') # for decode byte (black magic!)
+
+DEBUG = False
+
+class Propagare(object):
+    def __init__(self):
+        self.supported_media = ["elpais.com", "eldiario.es", "elmundo.es"] # media sources modules currently implemented
+        self.check_verb_online = "https://www.esfacil.eu/es/verbos/conjugacion.html?infinitive=" # for check spanish verbs online
+        self.sources = [] # used for news media sources
+        self.agents_file = 'core/txt/user-agents.txt' # set source path to retrieve user-agents
+        self.referer = 'http://127.0.0.1/' # set referer
+        self.agents = []
+        f = open(self.agents_file)
+        agents = f.readlines()
+        f.close()
+        for agent in agents:
+            self.agents.append(agent) # agents are random each request
+        self.ctx = ssl.create_default_context() # creating context to bypass SSL cert validation (black magic!)
+        self.ctx.check_hostname = False
+        self.ctx.verify_mode = ssl.CERT_NONE
+        self.jdata = [] # json stream container (a list)
+        self.verbs = [] # a list to store semantics (verbs)
+        self.total_verbs = 0
+        self.total_num = 0
+
+    def set_options(self, options):
+        self.options = options
+
+    def create_options(self, args=None):
+        self.optionParser = PropagareOptions()
+        self.options = self.optionParser.get_options(args)
+        if not self.options:
+            return False
+        return self.options
+
+    def remove_punctuation(self, news):
+        p = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
+        no_p = ""
+        for c in news:
+            if c not in p:
+                no_p = no_p + c
+        news = no_p           
+        return news
+
+    def update_progress(self, t, p, l):
+        b = p*100/l
+        msg = "\r{0}: [{1}] {2}%".format(t, "#"*p + "-"*(l-p), round(b,2))
+        if p >= 100: msg += " DONE\r\n"
+        sys.stdout.write(msg)
+        sys.stdout.flush()
+
+    def banner(self):
+        print '='*75
+        print "  ____                                              _       "
+        print " |  _ \ Si no 'contesta'...(es)__ _  __ _ _ __   __| | __ _ "
+        print " | |_) | '__/ _ \| '_ \ / _` |/ _` |/ _` | '_ \ / _` |/ _` |"
+        print " |  __/| | | (_) | |_) | (_| | (_| | (_| | | | | (_| | (_| |"
+        print " |_|   |_|  \___/| .__/ \__,_|\__, |\__,_|_| |_|\__,_|\__,_|"
+        print "                 |_|          |___/                (IA)v:0.1"
+        print "\n"+'='*75
+        print "\n"+self.optionParser.description
+        print "\n"+'='*75
+
+    @classmethod
+    def try_running(cls, func, error, args=None):
+        args = args or []
+        try:
+            return func(*args)
+        except Exception:
+            print error
+            if DEBUG == True:
+                traceback.print_exc()
+
+    def generate_json(self, n, category, date, tag, ID):
+        if "elpais.com" in n:
+            self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".json", "w") 
+        if "eldiario.es" in n:
+            self.json_report = open('data/'+n+"/"+category+"/"+ID+"/"+ID+".json", "w")
+        if "elmundo.es" in n:
+            if tag is None:
+                self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+ID+".json", "w")
+            else:
+                self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".json", "w")
+
+    def format_content(self, data):
+        html_parser = html2text.HTML2Text()
+        html_parser.ignore_links = True
+        html_parser.ignore_images = True
+        html_parser.ignore_emphasis = True
+        html_parser.bypass_tables = True 
+        html_parser.unicode_snob = True
+        html_parser.skip_internal_links = True
+        parsed = html_parser.handle(data)
+        parsed = parsed.replace("\n", " ") # clean \n
+        parsed = parsed.replace('\"', " ") # clean \"
+        parsed = parsed.replace("#","") # clean #
+        return parsed
+
+    def count_all_stored(self): # all sources
+        art_stored = 0
+        for root, dirs, files in os.walk('data/'):
+            art_stored += len(files) 
+        return (art_stored-1)/2 # ((txt+json)-last_log)/2
+
+    def count_art_stored(self, n): # by source
+        art_stored = 0
+        for root, dirs, files in os.walk('data/' + n):
+            art_stored += len(files)
+        return art_stored/2 # (txt+json)/2
+
+    def count_sources_list(self):
+        media_stored = 0
+        for root, dirs, files in os.walk("data/"):
+            for d in dirs:
+                if d in self.supported_media:
+                    media_stored = media_stored + 1
+        return media_stored
+
+    def check_art_exists(self, a, n):
+        sep = ".json"
+        for root, dirs, files in os.walk('data/' + n):
+            for f in files:
+                if f.endswith(".json"):
+                    f = str(f.split(sep, 1)[0])
+                    if str(f) in str(a):
+                        check_pass = False
+                        return check_pass
+                    else:
+                        check_pass = True
+
+    def create_sources_list(self):
+        for root, dirs, files in os.walk("sources/", topdown=False):
+            for name in files:
+                if name not in self.sources and name in self.supported_media:
+                    self.sources.append(name) # add name to sources list     
+
+    def check_art_repetitions(self, n, art_url_found):
+        sep = "/"
+        sep2 = ".html"
+        sep3 = "."
+        sep4 = "_"
+        filenames = []
+        for root, dirs, files in os.walk('data/' + n):
+            for f in files:
+                filename = os.path.basename(f)
+                filename = str(filename.split(sep3, 1)[0])
+                if not filename in filenames:
+                    filenames.append(filename)
+        if filenames:
+            for f in filenames:
+                for a in list(art_url_found): # iterate over a copy so items can be removed safely
+                    if "eldiario.es" in n:
+                        if str(f) in str(a):
+                            art_url_found.remove(a)
+                    if "elpais.com" in n:
+                        f = str(f.split(sep3, 1)[0])
+                        ID = str(a.split(sep, 5)[5]) 
+                        ID = str(ID.split(sep2, 1)[0])
+                        if sep in ID:
+                            ID = str(ID.split(sep, 2)[2])
+                        if str(ID) in str(f): # art stored, discard it
+                            art_url_found.remove(a)
+        return art_url_found
+
+    def is_a_verb(self, w):
+        if w.endswith("ar") or w.endswith("er") or w.endswith("ir"): # (Spanish: infinitive verb)
+             self.total_verbs = self.total_verbs + 1
+             self.verbs.append(w) # add verb to list 
+
+    def check_verb(self, verb):
+        check_verb_online = str(self.check_verb_online) + verb # url + verb
+        self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
+        headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
+        try:
+            req = urllib2.Request(check_verb_online, headers=headers) # apply fake user-agent and referer
+            reply = urllib2.urlopen(req, context=self.ctx).read()
+        except: 
+            print '\n[Error] - Imposible conectar con: ' + check_verb_online + '\n'
+            return False
+        if "¡Verbo no válido!" in reply or "¡Verbo no encontrado!" in reply: # working at: 30/04/2018
+            return False
+        else:
+            return True
+
+    def generate_data_stream(self):
+        if self.options.ssource:
+            if self.options.ssource in self.supported_media: # source is supported
+                source = 'data/' + self.options.ssource
+            else:
+                print "-"*25
+                print('\n[Error] La fuente indicada de estadísticas no está soportada! \n')
+                print "-"*25
+                print("\n[Info] Listado de fuentes soportadas :\n")
+                n = 0
+                for m in self.supported_media:
+                    n = n + 1
+                    print "    + ["+str(n)+"]:", m
+                print "" # zen out
+                sys.exit(2)
+        else:
+            source = 'data/'
+        if self.options.tsource:
+            if self.options.tsource in self.supported_media: # source is supported
+                source = 'data/' + self.options.tsource
+            else:
+                print('\n[Error] La fuente indicada de búsqueda no está soportada! \n')
+                print "-"*25
+                print("\n[Info] Listado de fuentes soportadas :\n")
+                n = 0
+                for m in self.supported_media:
+                    n = n + 1
+                    print "    + ["+str(n)+"]:", m
+                print "" # zen out
+                sys.exit(2)
+        else:
+            if not self.options.ssource:
+                source = 'data/'
+        for root, dirs, files in os.walk(source): # generate stream for analysis
+            for fl in files:
+                if fl.endswith(".json"): # extract content from json archives
+                    p=os.path.join(root,fl)
+                    kf = io.open(os.path.abspath(p), encoding='utf-8') 
+                    try:
+                        data = str(kf.read().encode('utf-8'))
+                    except:
+                        data = kf.read()
+                    try:
+                        self.jdata.append(json.loads(data))
+                    except:
+                        pass
+                    kf.close()
+        if not self.jdata:
+            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
+            print "[Info] Tienes el almacén vacío. Saliendo...\n"
+            sys.exit(2) # return
+        self.body_news_stream = []
+        self.dates_news_stream = []
+        self.author_news_stream = []
+        self.entry_news_stream = []
+        self.title_news_stream = []
+        #self.url_news_stream = []
+        for record in self.jdata:
+            for key, value in record.iteritems(): # unpack a 'huge-stream' dict, stored on a list (a tuple) using iteritems() (black magic!)
+                if key == "Noticia": # parse only for body content
+                    self.body_news_stream.append(value)
+                if key == "Fecha de publicación": # parse only for dates
+                    self.dates_news_stream.append(value)
+                if key == "Autor(a)": # parse only for authors
+                    self.author_news_stream.append(value)
+                if self.options.term: # extract more keys when searching a term
+                    if key == "Entrada": # parse only for entry content
+                        self.entry_news_stream.append(value)
+                    if key == "Titular": # parse only for title
+                        self.title_news_stream.append(value)
+                    #if key == "Fuente": # parse only for url source
+                    #    self.url_news_stream.append(value)
+
+    def stats(self):
+        print "\n[Info] Recopilando estadísticas del almacén...\n"
+        all_art_stored = self.count_all_stored()
+        if all_art_stored == 0 or all_art_stored < 0:
+            print '-'*25
+            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n" 
+            print "[Info] Tienes el almacén vacío. Saliendo...\n" 
+            return
+        else:
+            print "-"*25
+            json_stats = open('data/last_stats.json', "w") # generate json with last stats
+            json_stats_data = {}
+            self.generate_data_stream() # generate a 'buffer' stream with records (using json files)
+            self.create_sources_list()
+            media_sources = self.count_sources_list()
+            print "\n [+] Total medios:", str(media_sources)
+            print " [+] Total noticias:", str(all_art_stored) + "\n"
+            json_stats_data.update({"Total medios": str(media_sources)})
+            json_stats_data.update({"Total noticias": str(all_art_stored)})        
+            news_letters = 0
+            symbols = []
+            letters_dict = {}
+            words_dict = {}
+            words_3_dict = {}
+            words_4_dict = {}
+            words_5_dict = {}
+            words_6_dict = {}
+            words_7_dict = {}
+            words_8_dict = {}
+            verbs_dict = {}
+            authors_dict = {}
+            letters_counter = 0
+            words_counter = 0
+            verbs_counter = 0
+            authors_counter = 0
+            for news in self.body_news_stream:
+                news_parsed = self.remove_punctuation(str(news)) # remove punctuation signs / encode from unicode to str
+                news_parsed_noblank = news_parsed.replace(" ","") # data as a stream without blank spaces
+                news_parsed_noblank = news_parsed_noblank.lower() # change stream to lowercase
+                news_letters = news_letters + len(news_parsed_noblank)
+                news_split = news_parsed.split()
+                nums = "0123456789" # 0-9
+                symbols = map(chr, range(97, 123)) # a-z (A-Z)
+                for char in nums:
+                    symbols.append(char)
+                for l in symbols:
+                    if l in news_parsed_noblank: # only count those letters that exists
+                        letters_counter = int(letters_counter + news_parsed_noblank.count(l))
+                        if l in letters_dict:
+                            g = int(letters_dict[l]) # extract previous value
+                        else:
+                            g = 0
+                        lg = letters_counter + g # sum new letters counted to previous ones
+                        letters_dict.update({l:lg}) # update dict with new value
+                        letters_counter = 0 # flush counter
+                for w in news_split:
+                    w = w.lower() # change word to lowercase
+                    if w in words_dict:
+                        words_counter = words_counter + 1
+                        g = int(words_dict[w]) 
+                    else:
+                        g = 1 
+                    words_counter = words_counter + g
+                    words_dict.update({w:words_counter}) 
+                    words_counter = 0
+                    if self.options.checkverbs: 
+                        self.is_a_verb(w) # check for verbs (adding to a list) using semantic rules        
+            for key, value in words_dict.iteritems():
+                if len(key) == 3:
+                    words_3_dict[key] = value
+                if len(key) == 4:
+                    words_4_dict[key] = value
+                if len(key) == 5:
+                    words_5_dict[key] = value
+                if len(key) == 6:
+                    words_6_dict[key] = value
+                if len(key) == 7:
+                    words_7_dict[key] = value
+                if len(key) > 7:
+                    words_8_dict[key] = value
+            if self.options.ssource:
+                print '-'*25
+                print "\n[Info] Mostrando estadísticas para:", str(self.options.ssource) + "\n"
+            print " [+] Noticia más antigua:",str(min(self.dates_news_stream))
+            print " [+] Noticia más reciente:",str(max(self.dates_news_stream)) + "\n"
+            json_stats_data.update({"Noticia más antigua": str(min(self.dates_news_stream))})
+            json_stats_data.update({"Noticia más reciente": str(max(self.dates_news_stream))})
+            print " [+] Total símbolos (a-Z/0-9):", str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")"  
+            json_stats_data.update({"Total símbolos (a-Z/0-9)": str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")"  }) 
+            if max(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print " [+] Símbolo más utilizado: '"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"
+            json_stats_data.update({"Símbolo más utilizado": "'"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"}) 
+            if min(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_min = "vez"
+            else:
+                lw_min = "veces"
+            print " [+] Símbolo menos repetido: '"+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")" + "\n"
+            json_stats_data.update({"Símbolo menos repetido": "'"+str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")"}) 
+            print " [+] Total palabras:", str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")"  
+            json_stats_data.update({"Total palabras": str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")" }) 
+            if max(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print " [+] Palabra más repetida: '"+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida":  "'"+str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 3 letras: '"+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida (3 letras)":  "'"+str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 4 letras: '"+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida (4 letras)":  "'"+str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 5 letras: '"+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida (5 letras)":  "'"+str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 6 letras: '"+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida (6 letras)":  "'"+str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 7 letras: '"+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+            json_stats_data.update({"Palabra más repetida (7 letras)":  "'"+str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print "      - 8+ letras: '"+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")" + "\n"
+            json_stats_data.update({"Palabra más repetida (8 letras)":  "'"+str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+            if min(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                lw_min = "vez"
+            else:
+                lw_min = "veces"
+            print " [+] Palabra menos repetida: '"+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"
+            json_stats_data.update({"Palabra menos repetida":  "'"+str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"}) 
+            if max(words_8_dict.iteritems(), key=lambda kv: len(kv[0]))[1] < 2:
+                lw_max = "vez"
+            else:
+                lw_max = "veces"
+            print " [+] Palabra más larga y que más se usa: '"+ str(max(words_8_dict.iteritems(), key=lambda kv: len(kv[0]))[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=lambda kv: len(kv[0]))[1]) + " " + lw_max + ")" + "\n"
+            json_stats_data.update({"Palabra más larga y que más se usa":  "'"+str(max(words_8_dict.iteritems(), key=lambda kv: len(kv[0]))[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=lambda kv: len(kv[0]))[1]) + " " + lw_max + ")"}) 
+            if self.options.checkverbs: 
+                verb_flag = False
+                num = 0
+                print "[Info] Analizando (requiere tiempo!) en busca de: 'verbos infinitivos'...\n"
+                for verb in self.verbs:
+                    num = num + 1
+                    verb_flag = self.check_verb(verb) # re-check previous list of verbs (online!)
+                    if verb_flag is True: # is a verb
+                        if verb in verbs_dict:
+                            verbs_counter = verbs_counter + 1
+                            g = int(verbs_dict[verb]) 
+                        else:
+                            g = 1 
+                        verbs_counter = verbs_counter + g
+                        verbs_dict.update({verb:verbs_counter}) 
+                        verbs_counter = 0
+            if not verbs_dict:
+                num_verbs = 0
+                dif_verbs = 0
+            else:
+                num_verbs = str(sum(verbs_dict.values()))
+                dif_verbs = str(len(verbs_dict))
+            if verbs_dict:
+                if self.options.checkverbs:
+                    print " [+] Total verbos (infinitivos):", str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"
+                    json_stats_data.update({"Total verbos (infinitivos)": str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"}) 
+                    if max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                        lw_max = "vez"
+                    else:
+                        lw_max = "veces"
+                    print " [+] Verbo (infinitivo) más utilizado: '"+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
+                    json_stats_data.update({"Verbo (infinitivo) más utilizado": str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"}) 
+                    if min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
+                        lw_min = "vez"
+                    else:
+                        lw_min = "veces"
+                    print " [+] Verbo (infinitivo) menos repetido: '"+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")" + "\n"
+                    json_stats_data.update({"Verbo (infinitivo) menos repetido": str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"}) 
+            for a in self.author_news_stream:
+                if a in authors_dict:
+                    authors_counter = authors_counter + 1
+                    g = int(authors_dict[a]) 
+                else:
+                    g = 1 
+                authors_counter = authors_counter + g
+                authors_dict.update({a:authors_counter}) 
+                authors_counter = 0
+            print " [+] Total periodistas:", str(len(authors_dict.items()))
+            json_stats_data.update({"Total periodistas": str(len(authors_dict.items()))}) 
+            print " [+] Noticias por periodista:"
+            sep = "/"
+            num = 0
+            for a, c in sorted(authors_dict.items(), key=lambda x: x[1], reverse=True):
+                num = num + 1
+                if c < 2:
+                    lg = "noticia"
+                else:
+                    lg = "noticias"
+                a = unicode(a)
+                a = str(a.encode('utf-8').encode('ISO-8859-1'))
+                print "      - '"+str(a).title() + "' ("+ str(c) + " " + lg + ")"
+                json_stats_data["Autor(a)-["+str(num)+"]"] = str(a).title() + "' ("+ str(c) + " " + lg + ")"
+            json_stats.write(json.dumps(json_stats_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # write stats to json file
+            json_stats.close() # close .json                   
+
+    def search(self):
+        print "\n[Info] Generando índice de búsqueda...\n"
+        all_art_stored = self.count_all_stored()
+        if all_art_stored == 0:
+            print '-'*25
+            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n" 
+            print "[Info] Tienes el almacén vacío. Saliendo...\n" 
+            return
+        else:
+            print "-"*25
+        self.generate_data_stream() # generate a 'buffer' stream with all records (using json files)
+        term_reply = str(raw_input("\n $ Introduce una palabra (ej: corrupción): "))
+        term_reply = " " + term_reply + " " # parse term_reply to use it as a single word
+        counter_term = 0
+        counter_term_body = 0
+        counter_term_entry = 0
+        counter_term_title = 0
+        titles_stream = []
+        for n in self.body_news_stream:
+            if term_reply in n:
+                counter_term_body = counter_term_body + 1
+        for e in self.entry_news_stream:
+            if term_reply in e:
+                counter_term_entry = counter_term_entry + 1
+        for t in self.title_news_stream:
+            if term_reply in t:
+                counter_term_title = counter_term_title + 1
+                titles_stream.append(t)       
+        counter_term = counter_term_body + counter_term_entry + counter_term_title
+        ct = "vez" if counter_term == 1 else "veces"
+        cb = "artículo" if counter_term_body == 1 else "artículos"
+        ce = "entrada" if counter_term_entry == 1 else "entradas"
+        cl = "titular" if counter_term_title == 1 else "titulares"
+        if self.options.tsource:
+            print "\n" + '-'*25
+            print "\n[Info] Mostrando resultados en:", str(self.options.tsource)
+        print "\n [+] Aparece en: ("+str(counter_term_body)+" "+str(cb)+"), ("+str(counter_term_title)+" "+str(cl)+") y ("+str(counter_term_entry)+" "+str(ce)+")"
+        print " [+] Sale un total de: ("+str(counter_term)+" "+str(ct)+")" + "\n"
+
+    def run(self, opts=None):
+        options = self.create_options(opts)
+        self.set_options(options)
+        options = self.options
+        if options.update: # update tool
+            self.banner()
+            try:
+                print("\n [Info] Tratando de actualizar a la última versión estable:\n")
+                Updater()
+            except:
+                print("\n [Error] Algo ha ido mal!. Para hacer funcionar ésta opción, deberías clonar Propagare lanzando:\n")
+                print("  $ git clone https://github.com/epsylon/propagare\n")
+        if options.view_media: # show supported sources
+            self.banner()
+            print("\n[Info] Listado de fuentes soportadas:\n")
+            n = 0
+            for m in self.supported_media:
+                n = n + 1
+                print "    + ["+str(n)+"]:", m
+            print "" # zen out
+        if options.stats: # show archive stats
+            self.banner()
+            self.stats()
+        if options.term: # start a 'semantic' search for a term
+            self.banner()
+            self.search()
+        if options.news: # extract news (general run)
+            try: 
+                self.banner()
+                print "\n[Info] Buscando las fuentes de datos...\n"
+                print "[Info] Examinando el contenido en línea...\n"
+                sep='=='
+                sep2='?'
+                art_url=''
+                art_url_list=[]
+                art_title=''
+                art_author=''
+                art_location=''
+                num=0
+                flag=True
+                if options.esource: # user has set a specific source
+                    if options.esource in self.supported_media: # source is supported
+                        self.supported_media = [options.esource]
+                    else:
+                        print "-"*25
+                        print('\n[Error] La fuente que has indicado no está soportada! \n')
+                        print "-"*25
+                        print("\n[Info] Listado de fuentes soportadas :\n")
+                        n = 0
+                        for m in self.supported_media:
+                            n = n + 1
+                            print "    + ["+str(n)+"]:", m
+                        print "" # zen out
+                        return
+                self.create_sources_list()
+                for n in self.sources: # n = news media source
+                    n_url = n # default: use the source name as-is
+                    if not n.startswith("http"):
+                        if n == "elmundo.es": # this media only supports access using www
+                            n_url = "https://www." + n
+                        else:
+                            n_url = "https://" + n # SSL only
+                    print "- Visitando:", n_url
+                    self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
+                    headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
+                    try:
+                        req = urllib2.Request(n_url, headers=headers) # apply fake user-agent and referer
+                        reply = urllib2.urlopen(req, context=self.ctx).read()
+                    except: 
+                        print '\n[Error] - Imposible conectar con: ' + n
+                        pass
+                    f = open('sources/'+ n)
+                    regex = f.readlines()
+                    f.close()
+                    #print reply # nice to have this output for dev new modules
+                    for r in regex: # extract specific keywords from news: time, author, url (+variations), title, description, body
+                        if 'art_url==' in r or 'art_url2==' in r:
+                            art_url = r
+                            regex_art_url = str(art_url.split(sep, 1)[1]) # regex magics (art_url)
+                            pattern_art_url = re.compile(regex_art_url)
+                        if 'art_author==' in r:
+                            art_author = r
+                            regex_art_author = str(art_author.split(sep, 1)[1]) # regex magics (art_author)
+                            pattern_art_author = re.compile(regex_art_author)
+                        if 'art_time==' in r:
+                            art_time = r
+                            regex_art_time = str(art_time.split(sep, 1)[1]) # regex magics (art_time)
+                            pattern_art_time = re.compile(regex_art_time)
+                        if 'art_title==' in r:
+                            art_title = r
+                            regex_art_title = str(art_title.split(sep, 1)[1]) # regex magics (art_title)
+                            pattern_art_title = re.compile(regex_art_title)
+                        if 'art_description==' in r:
+                            art_description = r
+                            regex_art_description = str(art_description.split(sep, 1)[1]) # regex magics (art_description)
+                            pattern_art_description = re.compile(regex_art_description)
+                        if 'art_body==' in r:
+                            art_body = r
+                            regex_art_body = str(art_body.split(sep, 1)[1]) # regex magics (art_body)
+                            pattern_art_body = re.compile(regex_art_body, re.MULTILINE)
+                    try:
+                        art_url_found = re.findall(pattern_art_url, reply) # found art_url patterns on main page
+                        art_url_parsed = self.check_art_repetitions(n, art_url_found) # discard results previously stored
+                    except: 
+                        art_url_parsed = None
+                        pass
+                    art_stored = self.count_art_stored(n)
+                    if not art_url_parsed and not art_stored:
+                        print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de ésta fuente): 0\n"
+                        return
+                    if not art_url_parsed and art_stored > 0: # not any new article found + some articles stored 
+                        pass
+                    elif len(art_url_parsed) == 0 and art_stored == 0: # not any new article found + not any article stored
+                        print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de ésta fuente): " + str(art_stored) + "\n"
+                        return
+                    else: # new article found
+                        print "" # zen out
+                        for a in art_url_parsed:
+                            if "elmundo.es" in n: # re-parsing website: elmundo.es [10/05/2018]
+                                if '"' in a:
+                                    a = str(a.split('"', 1)[0])
+                                if '#' in a:
+                                    a = str(a.split('#', 1)[0])
+                                if not "http" in a or not "www.elmundo.es" in a or "vivienda" in a or "horoscopo" in a or "menu" in a or "?" in a or "indices" in a or "programacion-tv" in a:
+                                    a = None
+                            if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
+                                if '" title="' in a:
+                                    a = str(a.split('"', 1)[0])
+                                if '">' in a:
+                                    a = str(a.split('"', 1)[0])
+                                if "rastreador" in a or "http" in a or "autores" in a or "www.eldiario.es" in a or "/carnecruda" in a or "/contenido_patrocinado" in a:
+                                    a = None
+                            if "elpais.com" in n: # re-parsing website: elpais.com [24/04/2018]
+                                if "?" in a:
+                                    a = str(a.split(sep2, 1)[0])
+                                if "posicionador" in a:
+                                    a = a.replace('" class="posicionador',"")
+                                if "elpais.com" in a:
+                                    a = "https:/" + a
+                                else:
+                                    a = n_url + "/" + a
+                            if a is not None:
+                                if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
+                                    a = "https://eldiario.es" + a
+                                if a not in art_url_list:
+                                    check_pass = self.check_art_exists(a, n) # check if art found is previously stored for this media
+                                    if check_pass is True or check_pass is None:
+                                        art_url_list.append(a) # crawlered pages from main website
+                                        print "    + [IA]:", a
+                        if not art_url_list:
+                            pass
+                        else:
+                            print "\n[Info] Nuevos artículos encontrados: " + str(len(art_url_list)) + " | Total artículos almacenados (de ésta fuente): " + str(art_stored) + "\n"
+                            print "- Extrayendo:", n_url
+                            if not os.path.exists('data/'):
+                                os.makedirs('data/')
+                            if not os.path.exists('data/' + n):
+                                os.makedirs('data/' + n)
+                            for a in art_url_list:
+                                num=num+1 # art counter
+                                json_data = {} # json dict stream buffer
+                                if '"' in a: # re-parse url searching for " after it
+                                    sep = '"'
+                                    a = str(a.split(sep, 1)[0])
+                                print "\n    + ["+str(num)+"/"+str(len(art_url_list))+"] Visitando:", a
+                                self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
+                                headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set user-agent and referer
+                                try:
+                                    req = urllib2.Request(a, headers=headers) # apply fake user-agent and referer
+                                    reply_art = urllib2.urlopen(req, context=self.ctx).read()
+                                except: 
+                                    print '\n[Error] - Imposible conectar con: ' + a
+                                    return
+                                art_url_author_found = re.findall(pattern_art_author, reply_art) # found art_author pattern on page
+                                if not art_url_author_found:
+                                    for r in regex: # extract another combination
+                                        if 'art_author2==' in r:
+                                            art_author = r
+                                            try:
+                                                regex_art_author = str(art_author.split(sep, 1)[1]) # re-regex magics (art_author)
+                                                pattern_art_author = re.compile(regex_art_author)
+                                            except:
+                                                break
+                                    art_url_author_found = re.findall(pattern_art_author, reply_art) # try another art_author pattern on page
+                                if not art_url_author_found: # not any author found using regex (use default for each media)
+                                    if "elmundo.es" in n: # default author for elmundo.es when not signed
+                                        art_url_author_found.append("Ediciones El Mundo")
+                                    if "elpais.com" in n: # default author for elpais.com when not signed
+                                        art_url_author_found.append("Ediciones El País")
+                                    if "eldiario.es" in n: # default author for elpais.com when not signed
+                                        art_url_author_found.append("Ediciones El Diario")
+                                else:
+                                    if "elpais.com" in n: # based on specific reg exp.
+                                        sep = '"'
+                                        for author in art_url_author_found:
+                                            art_url_author_found.remove(author)
+                                            author = str(author.split(sep, 1)[0])
+                                            art_url_author_found.append(author)
+                                art_url_time_found = re.findall(pattern_art_time, reply_art) # found art_time pattern on page
+                                art_url_title_found = re.findall(pattern_art_title, reply_art) # found art_title pattern on page
+                                art_url_description_found = re.findall(pattern_art_description, reply_art) # found art_description pattern on page 
+                                art_url_body_found = re.findall(pattern_art_body, reply_art) # found art_body pattern on page (MULTILIN)
+                                if not art_url_body_found: # not any body found
+                                    if "eldiario.es" in n:
+                                        for r in regex: # extract another combination
+                                            if 'art_body2==' in r:
+                                                art_body = r
+                                                regex_art_body = str(art_body.split(sep, 1)[1])
+                                                pattern_art_body = re.compile(regex_art_body)
+                                        art_url_body_found = re.findall(pattern_art_body, reply_art) 
+                                time.sleep(0.1) # tic, tac!!!
+                                self.update_progress("\n      - ETA", num, len(art_url_list))
+                                print "" # zen out
+                                if "elmundo.es" in a: # [10/05/2018] # schema: https://www.elmundo.es/category/{tag}/date/ID
+                                    a_path = a.replace("http://www.elmundo.es/","") # remove pre-url (note: http)
+                                    a_path = a_path.replace(".html","") # remove post-url
+                                if "elpais.com" in a: # [24/04/2018] # schema: https://elpais.com/category/date/tag/ID
+                                    a_path = a.replace("https://elpais.com/","") # remove pre-url
+                                    a_path = a_path.split(".html")[0] # remove post-url
+                                    if "elpais.com" in a_path: # re-parsing url [24/04/2018] # schema: https://loc.elpais.com/...
+                                        a_path = a_path.split("elpais.com/")
+                                        a_path = a_path[1]
+                                if "eldiario.es" in a: # [08/05/2018] # schema: https://eldiario.es/category/{tag}/ID
+                                    a_path = a.replace("https://eldiario.es/","") # remove pre-url
+                                    a_path = a_path.replace(".html","") # remove post-url
+                                if "/" in a_path: # / mostly used like url-category sep keyword
+                                    a_path = a_path.split("/")
+                                    if "elmundo.es" in a:
+                                        try: # try with /tag/
+                                            category = a_path[0]
+                                            date_year = a_path[2]
+                                            date_month = a_path[3]
+                                            date_day = a_path[4]
+                                            date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
+                                            tag = a_path[1]
+                                            ID = a_path[5]
+                                            if not os.path.exists('data/'+n+"/"+category):
+                                                os.makedirs('data/'+n+"/"+category)
+                                            if not os.path.exists('data/'+n+"/"+category+"/"+date):
+                                                os.makedirs('data/'+n+"/"+category+"/"+date)
+                                            if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag): # create new record
+                                                os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
+                                            path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".txt" # set path to file 
+                                            self.generate_json(n, category, date, tag, ID) # generate .json  
+                                        except:
+                                            try:
+                                                category = a_path[0]
+                                                date_year = a_path[1]
+                                                date_month = a_path[2]
+                                                date_day = a_path[3]
+                                                date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
+                                                ID = a_path[4]
+                                                if not os.path.exists('data/'+n+"/"+category):
+                                                    os.makedirs('data/'+n+"/"+category)
+                                                if not os.path.exists('data/'+n+"/"+category+"/"+date):
+                                                    os.makedirs('data/'+n+"/"+category+"/"+date)
+                                                path = 'data/'+n+"/"+category+"/"+date+"/"+ID+".txt" # set path to file 
+                                                self.generate_json(n, category, date, None, ID) # generate .json
+                                            except:
+                                                pass
+                                    if "eldiario.es" in a:
+                                        category = a_path[0]
+                                        ID = a_path[1]
+                                        if not os.path.exists('data/'+n+"/"+category):
+                                            os.makedirs('data/'+n+"/"+category)
+                                        if not os.path.exists('data/'+n+"/"+category+"/"+ID):
+                                            os.makedirs('data/'+n+"/"+category+"/"+ID)
+                                        path = 'data/'+n+"/"+category+"/"+ID+"/"+ID+".txt" # set path to file 
+                                        self.generate_json(n, category, None, None, ID) # generate .json         
+                                    if "elpais.com" in a:
+                                        category = a_path[0]
+                                        date_year = a_path[1] 
+                                        date_month = a_path[2]
+                                        date_day = a_path[3] 
+                                        date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
+                                        tag = a_path[4]
+                                        ID = a_path[5]
+                                        if not os.path.exists('data/'+n+"/"+category):
+                                            os.makedirs('data/'+n+"/"+category)
+                                        if not os.path.exists('data/'+n+"/"+category+"/"+date):
+                                            os.makedirs('data/'+n+"/"+category+"/"+date)
+                                        if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag):
+                                            os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
+                                        if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID): # create new record
+                                            os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID)  
+                                        path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".txt" # set path to file 
+                                        self.generate_json(n, category, date, tag, ID) # generate .json
+                                    try:
+                                        fs = open(path, "w") # generate .txt
+                                    except:
+                                        print "\n[Error] No se pueden extraer las noticias de manera correcta. Saliendo...\n"
+                                        return
+                                    fs.write("Fuente: " + str(a).encode('utf-8') + "\n") # write source url
+                                    json_data.update({"Fuente": str(a)})
+                                    for t in art_url_time_found:
+                                        fs.write("Fecha de publicación: " + str(t).encode('utf-8') + "\n") # write time
+                                        json_data.update({"Fecha de publicación": str(t)})
+                                    for author in art_url_author_found:
+                                        if "\t" in author:
+                                            author = author.split('\t', 1)[1] # re-parse for \t
+                                        author = str(author.decode('ISO-8859-1').strip())
+                                        fs.write("Autor(a): " + str(author).encode('utf-8') + "\n") # write author
+                                        json_data.update({"Autor(a)": str(author)})
+                                    for title in art_url_title_found:
+                                        title = str(title.decode('ISO-8859-1').strip())
+                                        parsed = self.format_content(str(title)) 
+                                        fs.write("Titular: " + str(parsed).encode('utf-8') + "\n") # write title
+                                        json_data.update({"Titular": str(parsed)})
+                                    for description in art_url_description_found:
+                                        description = str(description.decode('ISO-8859-1').strip())
+                                        parsed = self.format_content(str(description))
+                                        fs.write("Entrada: " + str(parsed).encode('utf-8') + "\n") # write description
+                                        json_data.update({"Entrada": str(parsed)})
+                                    body_complete = ""
+                                    for body in art_url_body_found:
+                                        body = str(body.decode('ISO-8859-1').strip())
+                                        if "elmundo.es" in a:
+                                            body = body.split("TE PUEDE INTERESAR",1)[0]
+                                            if "###" in body:
+                                                body = body.replace("###","")
+                                        if "elpais.com" in a: 
+                                            body = body.replace("<span>Explora nuestras historias</span> por temas","")
+                                            body = body.replace("Recibe nuestra newsletter", "")
+                                        body_complete += body + "\n\n"
+                                        if "elmundo.es" in a:
+                                            break
+                                    parsed = self.format_content(body_complete) 
+                                    fs.write("\n" + str(parsed).encode('utf-8')) # write (plain text) body without keyword
+                                    json_data.update({"Noticia": str(parsed)})
+                                    self.json_report.write(json.dumps(json_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # json dump
+                                    fs.close() # close .txt
+                                    self.json_report.close() # close .json
+                                    self.total_num = self.total_num + 1
+                    num = 0 # flush art found counter
+                    art_url_list = [] # flush art list
+                all_art_stored = self.count_all_stored()
+                if self.total_num:
+                    print "" # zen out
+                else:
+                    self.total_num = 0
+                print "\n[Info] Nuevos artículos descargados: " + str(self.total_num) + " | Total artículos almacenados (de todas las fuentes): " + str(all_art_stored) + "\n"
+                if all_art_stored > 0:  
+                    if not self.options.forceno:
+                        print "-"*25
+                        stats_reply = raw_input("¿Quieres ver las estadísticas comparadas (S/n)?\n")
+                    else:
+                        stats_reply = "N"
+                    if stats_reply == "s" or stats_reply == "S":
+                        self.stats()
+                    else:
+                        print "\n[Info] Saliendo...\n"
+                        return
+                    if not self.options.forceno:
+                        print "-"*25
+                        search_reply = raw_input("¿Quieres buscar información semántica (S/n)?\n")
+                    else:
+                        search_reply = "N"
+                    if search_reply == "s" or search_reply == "S":
+                        self.search()
+                    else:
+                        print "\n[Info] Saliendo...\n"
+            except (KeyboardInterrupt, SystemExit):
+                sys.exit()
+
+if __name__ == "__main__":
+    app = Propagare()
+    app.run()
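
  For reference, a minimal Python 2.7 sketch (the same interpreter the tool targets) of reading back one of the records written by the extraction code above. The directory layout follows the path built in main.py; the "<ID>.json" filename is an assumption, since generate_json() builds that file elsewhere in main.py:

       # -*- coding: utf-8 -*-
       import io, json

       # Hypothetical example path; records are laid out as
       # data/<fuente>/<categoria>/<fecha>/<etiqueta>/<ID>/<ID>.txt and we assume
       # generate_json() writes the matching <ID>.json next to it.
       record_path = "data/elpais.com/internacional/2018-06-01/politica/1/1.json"

       with io.open(record_path, "r", encoding="utf-8") as f:
           record = json.load(f)

       # Keys written by the extractor: "Fuente", "Fecha de publicación",
       # "Autor(a)", "Titular", "Entrada" and "Noticia".
       for field in (u"Fuente", u"Fecha de publicación", u"Autor(a)", u"Titular", u"Entrada"):
           if field in record:
               print (field + u": " + record[field]).encode("utf-8")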

+ 52 - 0
propagare/core/options.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-"
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+-------
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+-------
+
+"A diferencia del odio, el amor se expande sin necesidad de propaganda..."
+
+"""
+import optparse
+
+class PropagareOptions(optparse.OptionParser):
+    def __init__(self, *args):
+        optparse.OptionParser.__init__(self,
+                           description=' Propagar(es): extracción, organización y análisis semántico de noticias.',
+                           prog='propagare.py',
+                           version='\nPropagare v0.1 - 2018 - GPLv3.0 -> por psy\n',
+                           usage='propagare [OPTIONS]')
+        self.add_option("--update", action="store_true", dest="update", help="actualiza la herramienta a su última versión")
+        self.add_option("--list", action="store_true", dest="view_media", help="lista las fuentes de noticias soportadas")
+        self.add_option("-e", action="store_true", dest="news", help="extrae noticias de todas las fuentes")
+        self.add_option("--es", action="store", dest="esource", help="especifíca una fuente de donde extraer noticias")
+        self.add_option("--force-no", action="store_true", dest="forceno", help="utilízalo para almacenar datos (Big Data)")
+        self.add_option("-s", action="store_true", dest="stats", help="muestra algunas estadísticas interesantes")
+        self.add_option("--ss", action="store", dest="ssource", help="especifíca una fuente de donde extraer estadísticas")
+        self.add_option("--check-verbs", action="store_true", dest="checkverbs", help="chequea si una palabra es un verbo (Experimental)")
+        self.add_option("-t", action="store_true", dest="term", help="busca el término en el almacén")
+        self.add_option("--ts", action="store", dest="tsource", help="busca el término en una fuente determinada")
+        #self.add_option("--timer", action="store", dest="timer", help="establece un tiempo para buscar nuevas noticias (en min.)")
+
+    def get_options(self, user_args=None):
+        (options, args) = self.parse_args(user_args)
+        options.args = args
+        if (not options.news and not options.stats and not options.term and not options.update and not options.view_media):
+            print '='*75
+            print "  ____                                              _       "
+            print " |  _ \ Si no 'contesta'...(es)__ _  __ _ _ __   __| | __ _ "
+            print " | |_) | '__/ _ \| '_ \ / _` |/ _` |/ _` | '_ \ / _` |/ _` |"
+            print " |  __/| | | (_) | |_) | (_| | (_| | (_| | | | | (_| | (_| |"
+            print " |_|   |_|  \___/| .__/ \__,_|\__, |\__,_|_| |_|\__,_|\__,_|"
+            print "                 |_|          |___/                (IA)v:0.1"
+            print "\n"+'='*75
+            print "\n"+self.description
+            print "\n"+'='*75
+            print "\n [+] Opciones: -h o --help"
+            print "\n"+'='*75 + "\n"
+        return options
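
  A short usage sketch of the parser above, with a hypothetical argument vector equivalent to running "propagare -e --es=elpais.com --force-no":

       from core.options import PropagareOptions

       parser = PropagareOptions()
       options = parser.get_options(["-e", "--es=elpais.com", "--force-no"])

       print options.news     # True  -> extract news from the sources
       print options.esource  # 'elpais.com'
       print options.forceno  # True  -> store data without interactive prompts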

+ 95 - 0
propagare/core/txt/user-agents.txt

@@ -0,0 +1,95 @@
+Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36
+Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36
+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.1 (KHTML, like Gecko) Chrome/14.0.811.0 Safari/535.1
+Mozilla/5.0 (X11; CrOS i686 12.433.109) AppleWebKit/534.30 (KHTML, like Gecko) Chrome/12.0.742.93 Safari/534.30
+Mozilla/5.0 (Macintosh; U; Mac OS X 10_6_1; en-US) AppleWebKit/530.5 (KHTML, like Gecko) Chrome/ Safari/530.5
+Mozilla/5.0 (Linux; U; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.27 Safari/525.13
+Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.1
+Mozilla/5.0 (Windows NT 6.1; rv:21.0) Gecko/20130401 Firefox/21.0
+Mozilla/6.0 (Windows; U; Windows NT 7.0; en-US; rv:1.9.0.8) Gecko/2009032609 Firefox/3.0.9 (.NET CLR 3.5.30729)
+Mozilla/5.0 (X11; U; Linux x86_64; fr; rv:1.9.0.9) Gecko/2009042114 Ubuntu/9.04 (jaunty) Firefox/3.0.9
+Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-TW; rv:1.8.0.2) Gecko/20060308 Firefox/1.5.0.2
+Mozilla/5.0 (Windows; U; Windows NT5.1; en; rv:1.7.10) Gecko/20050716 Firefox/1.0.5
+Mozilla/5.0 (Windows; U; Win98; fr-FR; rv:1.7.6) Gecko/20050226 Firefox/1.0.1
+Mozilla/5.0 (X11; U; Linux i686; de-DE; rv:1.6) Gecko/20040207 Firefox/0.8
+Mozilla/5.0 (X11) KHTML/4.9.1 (like Gecko) Konqueror/4.9
+Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)
+Mozilla/5.0 (compatible; Konqueror/3.5; NetBSD 4.0_RC3; X11) KHTML/3.5.7 (like Gecko)
+Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; AS; rv:11.0) like Gecko
+Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko
+Mozilla/4.0(compatible; MSIE 7.0b; Windows NT 6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 7.0; InfoPath.3; .NET CLR 3.1.40767; Trident/6.0; en-IN)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)
+Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/4.0; InfoPath.2; SV1; .NET CLR 2.0.50727; WOW64)
+Mozilla/5.0 (compatible; MSIE 10.0; Macintosh; Intel Mac OS X 10_7_3; Trident/6.0)
+Mozilla/4.0 (Compatible; MSIE 8.0; Windows NT 5.2; Trident/6.0)
+Mozilla/1.22 (compatible; MSIE 10.0; Windows 3.1)
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0
+Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0; yie8)
+Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.0; Windows NT 5.1; .NET CLR 1.1.4322; Zango 10.1.181.0)
+Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)
+Mozilla/4.0 (compatible; MSIE 6.0; AOL 8.0; Windows NT 5.1; SV1)
+Mozilla/5.0 (X11; Ubuntu; Linux armv7l; rv:17.0) Gecko/20100101 Firefox/17.0
+Mozilla/2.02E (Win95; U)
+Mozilla/5.0 (iPhone; U; CPU iOS 2_0 like Mac OS X; en-us)
+Mozilla/5.0 (Linux; U; Android 0.5; en-us)
+Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)
+Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)
+Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/525.13 (KHTML, like Gecko) Chrome/0.2.149.29 Safari/525.13
+Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)
+Mozilla/5.0 (compatible; Yahoo! Slurp; http://help.yahoo.com/help/us/ysearch/slurp)
+Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 (FM Scene 4.6.1)
+Mozilla/5.0 (Windows; U; Windows NT 5.1; de; rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 (.NET CLR 3.5.30729) (Prevx 3.0.5)
+Mozilla/5.0 (compatible; Konqueror/4.5; FreeBSD) KHTML/4.5.4 (like Gecko)
+Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:5.0) Whistler/20110021 myibrow/5.0.0.0
+Mozilla/4.08 [en] (WinNT; I ;Nav)
+Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/534.30 (KHTML, like Gecko) Version/4.0 Oupeng/10.2.1.86910 Safari/534.30
+Mozilla/5.0 (SMART-TV; Linux; Tizen 2.3) AppleWebkit/538.1 (KHTML, like Gecko) SamsungBrowser/1.0 Safari/538.1
+myibrow/2.2 (Windows; U; Windows NT 5.1; cs; rv:1.8.1.14) Gecko/20080001 My Internet Browser/2.2.0.0 20080913235045
+Opera/9.25 (Windows NT 6.0; U; en)
+Privoxy/1.0
+CERN-LineMode/2.15
+cg-eye interactive
+China Local Browser 2.6
+ClariaBot/1.0
+Comos/0.9_(robot@xyleme.com)
+Crawler@alexa.com
+DonutP; Windows98SE
+Dr.Web (R) online scanner: http://online.drweb.com/
+Dragonfly File Reader
+Eurobot/1.0 (http://www.ayell.eu)
+FARK.com link verifier
+FavIconizer
+Feliz - Mixcat Crawler (+http://mixcat.com)
+TwitterBot (http://www.twitter.com)
+DataCha0s/2.0
+EvaalSE - bot@evaal.com
+Feedfetcher-Google; (+http://www.google.com/feedfetcher.html)
+archive.org_bot
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:49.0) Gecko/20100101 Firefox/49.0
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_1) AppleWebKit/602.2.14 (KHTML, like Gecko) Version/10.0.1 Safari/602.2.14
+Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12) AppleWebKit/602.1.50 (KHTML, like Gecko) Version/10.0 Safari/602.1.50
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Windows NT 10.0; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0
+Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.71 Safari/537.36
+Mozilla/5.0 (Windows NT 6.1; WOW64; rv:49.0) Gecko/20100101 Firefox/49.0
+Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko
+Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0
+Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36
+Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:49.0) Gecko/20100101 Firefox/49.0
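
  A hedged sketch of how one of these strings could be attached to an outgoing request. That main.py picks a random User-Agent from core/txt/user-agents.txt for its downloads is an assumption, since the fetching code is elsewhere in main.py:

       import random, urllib2

       # Assumption: the extractor sends a random User-Agent taken from this list.
       agents = [ua.strip() for ua in open("core/txt/user-agents.txt") if ua.strip()]
       request = urllib2.Request("https://elpais.com",
                                 headers={"User-Agent": random.choice(agents)})
       html = urllib2.urlopen(request).read()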

+ 34 - 0
propagare/core/update.py

@@ -0,0 +1,34 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-"
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+-------
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+-------
+
+"A diferencia del odio, el amor se expande sin necesidad de propaganda."
+
+"""
+import os
+from subprocess import PIPE
+from subprocess import Popen as execute
+        
+class Updater(object):   
+    def __init__(self):
+        GIT_REPOSITORY = "https://github.com/epsylon/propagare"
+        rootDir = os.path.abspath(os.path.join(os.path.dirname( __file__ ), '../../', ''))
+        if not os.path.exists(os.path.join(rootDir, ".git")):
+            print " [Info] No se ha encontrado ningún respositorio: .git!\n"
+            print "="*30
+            print "\n Para hacer funcionar ésta opción, debes clonar Propagare mediante:\n"
+            print " $ git clone %s" % GIT_REPOSITORY + "\n"
+        else:
+            checkout = execute("git checkout . && git pull", shell=True, stdout=PIPE, stderr=PIPE).communicate()[0]
+            print checkout
+            if not "Already up-to-date" in checkout:
+                print "Enhorabuena!! Propagare se ha actualizado... ;-)\n"
+            else:
+                print "Propagare no necesita actualización... ;-)\n"

+ 67 - 0
propagare/docs/LEEME.txt

@@ -0,0 +1,67 @@
+===========================================================================
+  ____                                              _       
+ |  _ \ Si no 'contesta'...(es)__ _  __ _ _ __   __| | __ _ 
+ | |_) | '__/ _ \| '_ \ / _` |/ _` |/ _` | '_ \ / _` |/ _` |
+ |  __/| | | (_) | |_) | (_| | (_| | (_| | | | | (_| | (_| |
+ |_|   |_|  \___/| .__/ \__,_|\__, |\__,_|_| |_|\__,_|\__,_|
+                 |_|          |___/                (IA)v:0.1
+
+===========================================================================
+
+----------
+
+ + Web:  https://github.com/epsylon/propagare || https://propaganda.is
+
+===============
+  Project:
+===============
+
+  Propagare - a tool for the extraction, organization and semantic analysis of news.
+
+===============
+  Installation:
+===============
+
+  Propagare is multi-platform. It requires Python (>2.7.9) and the following library:
+
+       python-html2text - Python module for converting HTML to Markdown text
+       
+  On Debian-based systems (e.g. Ubuntu), you can install it with:
+
+       sudo apt-get install python-html2text
+
+  You can also use the automatic installation script shipped with the tool, by running:
+
+       python setup.py install
+
+  Or even by using 'pip':
+
+       pip install html2text
+
+  Library sources:
+
+       * Python: https://www.python.org/downloads/
+       * Pypi-html2text: https://pypi.org/project/html2text/
+
+====================
+  Options:
+====================
+   
+  --version      show the program version
+  -h, --help     show this help message
+
+  --update       update the tool to its latest version
+
+  --list         list the supported news sources
+
+  -e             extract news from all sources
+  --es=ESOURCE   specify a single source to extract news from
+  --force-no     use it to store data without interactive prompts (Big Data)
+
+  -s             show some interesting statistics
+  --ss=SSOURCE   specify a single source to extract statistics from
+  --check-verbs  check whether a word is a verb (Experimental)
+
+  -t             search for the term in the local store
+  --ts=TSOURCE   search for the term in a given source
+

File diff suppressed because it is too large
+ 209 - 0
propagare/docs/LICENSE


+ 19 - 0
propagare/propagare

@@ -0,0 +1,19 @@
+#!/usr/bin/env python 
+# -*- coding: utf-8 -*-"
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+-------
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+-------
+
+"A diferencia del odio, el amor se expande sin necesidad de propaganda..."
+
+"""
+from core.main import Propagare
+
+if __name__ == "__main__":
+    app = Propagare()
+    app.run()

+ 42 - 0
propagare/setup.py

@@ -0,0 +1,42 @@
+#!/usr/bin/env python
+import sys
+if sys.version_info[0] != 2:
+    sys.exit("Disculpa, Propagare necesita Python >= 2.7.9") 
+from setuptools import setup, find_packages
+setup(
+    name='propagare',
+    version='0.1',
+    license='GPLv3',
+    author_email='epsylon@riseup.net',
+    author='psy',
+    description='Extracción, organización y análisis semántico de noticias.',
+    url='https://github.com/epsylon/propagare',
+    long_description=open('docs/LEEME.txt').read(),
+    packages=find_packages(),
+    install_requires=['html2text'],
+    include_package_data=True,
+    package_data={
+        'core': ['txt/*.txt'],
+    },
+    entry_points={
+        'console_scripts': [
+            'propagare=Propagare:core.main',
+        ],
+        'gui_scripts': [
+            'propagare=Propagare:core.main',
+        ],
+    },
+    keywords='Propaganda Analisis Estadísticas Noticias Periodistas BigData',
+    classifiers=[
+        'Development Status :: 3 - Alpha',
+        "Environment :: Console", 
+        "Intended Audience :: System Administrators",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: GNU General Public License v3 (GPLv3)",
+        "Operating System :: OS Independent",
+        "Programming Language :: Python",
+        'Programming Language :: Python :: 2.7',
+        "Topic :: Internet", 
+      ],
+      zip_safe=False
+)

+ 19 - 0
propagare/sources/eldiario.es

@@ -0,0 +1,19 @@
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+----------------
+
+Regex magic keywords to extract data from: eldiario.es
+
+"""
+art_url==<a class="lnk" href="(.+?)
+art_author==<a href="/autores/(.+?)/"
+art_author2==red_n': '(.+?)' 
+art_time=="dateCreated" : "(.+?)",
+art_title==og:title" content="(.+?)/>
+art_description==og:description" content="(.+?)/>
+art_body==<p class="mce">(.+?)</p>
+art_body2==<p class="mce"> (.+?)

+ 18 - 0
propagare/sources/elmundo.es

@@ -0,0 +1,18 @@
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+----------------
+
+Regex magic keywords to extract data from: elmundo.es
+
+"""
+art_url==<a href="(.+?)
+art_author==<span itemprop="name">(.+?)</span>
+art_author2==s.prop75="(.+?)";
+art_time=="DC.date.issued" content="(.+?)">
+art_title==og:title" content="(.+?)"/>
+art_description==og:description" content="(.+?)"/>
+art_body==<p>(.+?)

+ 19 - 0
propagare/sources/elpais.com

@@ -0,0 +1,19 @@
+"""
+Propagare - 2018 - by psy (epsylon@riseup.net)
+
+You should have received a copy of the GNU General Public License along
+with Propagare; if not, write to the Free Software Foundation, Inc., 51
+Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+----------------
+
+Regex magic keywords to extract data from: elpais.com
+
+"""
+art_url==<a href="/(.+?)">
+art_url2==<a href="/(.+?)" >
+art_author==<meta name="author" 		content="(.+?)
+art_author2=="author" title="Ver todas las noticias de (.+?)
+art_time==<meta property="article:modified_time" content="(.+?)" />
+art_title==<meta property="og:title" content="(.+?)"/>
+art_description==<meta property="og:description" content="(.+?)"/>
+art_body==<p>(.+?)</p>
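
  The three sources/* files above share one layout: a line per field in the form "keyword==regex" (art_url, art_author, art_title, and so on). A minimal sketch of loading such a file and applying one of its expressions, assuming the first "==" separates the keyword from the regular expression:

       # -*- coding: utf-8 -*-
       import re, urllib2

       def load_source(path):
           # Assumption: every data line is "<keyword>==<regex>"; anything else is skipped.
           patterns = {}
           for line in open(path):
               line = line.strip()
               if "==" in line:
                   key, regex = line.split("==", 1)
                   patterns[key] = regex
           return patterns

       patterns = load_source("sources/elpais.com")
       html = urllib2.urlopen("https://elpais.com").read()
       for title in re.findall(patterns["art_title"], html)[:5]:
           print title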