main.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Propagare - 2018 - by psy (epsylon@riseup.net)
-------
You should have received a copy of the GNU General Public License along
with Propagare; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
-------
"A diferencia del odio, el amor se expande sin necesidad de propaganda."
"""
import os, sys, urllib2, ssl, random, re, time, string
import json, operator, io
import traceback # used by try_running() when DEBUG is enabled
try:
    import html2text
except:
    print "\n[Error] - No se encuentra la librería: html2text. \n\n Puedes instalarla ejecutando en la terminal:\n\n sudo apt-get install python-html2text\n\n Después, prueba a lanzar de nuevo la aplicación.\n"
    sys.exit(2)
from core.options import PropagareOptions
from core.update import Updater
reload(sys)
sys.setdefaultencoding('utf-8') # for decoding bytes (black magic!)
DEBUG = False
class Propagare(object):
    def __init__(self):
        self.supported_media = ["elpais.com", "eldiario.es", "elmundo.es"] # media source modules currently implemented
        self.check_verb_online = "https://www.esfacil.eu/es/verbos/conjugacion.html?infinitive=" # to check Spanish verbs online
        self.sources = [] # used for news media sources
        self.agents_file = 'core/txt/user-agents.txt' # source path to retrieve user-agents
        self.referer = 'http://127.0.0.1/' # set referer
        self.agents = []
        f = open(self.agents_file)
        agents = f.readlines()
        f.close()
        for agent in agents:
            self.agents.append(agent) # a random agent is chosen on each request
        self.ctx = ssl.create_default_context() # create context to bypass SSL cert validation (black magic!)
        self.ctx.check_hostname = False
        self.ctx.verify_mode = ssl.CERT_NONE
        self.jdata = [] # json stream container (a list)
        self.verbs = [] # a list to store semantics (verbs)
        self.total_verbs = 0
        self.total_num = 0
    def set_options(self, options):
        self.options = options
    def create_options(self, args=None):
        self.optionParser = PropagareOptions()
        self.options = self.optionParser.get_options(args)
        if not self.options:
            return False
        return self.options
    def remove_punctuation(self, news):
        p = '''!()-[]{};:'"\,<>./?@#$%^&*_~'''
        no_p = ""
        for c in news:
            if c not in p:
                no_p = no_p + c
        news = no_p
        return news
    def update_progress(self, t, p, l):
        b = p*100/l
        msg = "\r{0}: [{1}] {2}%".format(t, "#"*p + "-"*(l-p), round(b,2))
        if p >= 100: msg += " DONE\r\n"
        sys.stdout.write(msg)
        sys.stdout.flush()
    def banner(self):
        print '='*75
        print " ____ _ "
        print " | _ \ Si no 'contesta'...(es)__ _ __ _ _ __ __| | __ _ "
        print " | |_) | '__/ _ \| '_ \ / _` |/ _` |/ _` | '_ \ / _` |/ _` |"
        print " | __/| | | (_) | |_) | (_| | (_| | (_| | | | | (_| | (_| |"
        print " |_| |_| \___/| .__/ \__,_|\__, |\__,_|_| |_|\__,_|\__,_|"
        print " |_| |___/ (IA)v:0.1"
        print "\n"+'='*75
        print "\n"+self.optionParser.description
        print "\n"+'='*75
    @classmethod
    def try_running(cls, func, error, args=None):
        args = args or []
        try:
            return func(*args)
        except Exception:
            print(error, "error")
            if DEBUG == True:
                traceback.print_exc()
    def generate_json(self, n, category, date, tag, ID):
        if "elpais.com" in n:
            self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".json", "w")
        if "eldiario.es" in n:
            self.json_report = open('data/'+n+"/"+category+"/"+ID+"/"+ID+".json", "w")
        if "elmundo.es" in n:
            if tag is None:
                self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+ID+".json", "w")
            else:
                self.json_report = open('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".json", "w")
    def format_content(self, data):
        html_parser = html2text.HTML2Text()
        html_parser.ignore_links = True
        html_parser.ignore_images = True
        html_parser.ignore_emphasis = True
        html_parser.bypass_tables = True
        html_parser.unicode_snob = True
        html_parser.skip_internal_links = True
        parsed = html_parser.handle(data)
        parsed = parsed.replace("\n", " ") # clean \n
        parsed = parsed.replace('\"', " ") # clean \"
        parsed = parsed.replace("#","") # clean #
        return parsed
    def count_all_stored(self): # all sources
        art_stored = 0
        for root, dirs, files in os.walk('data/'):
            art_stored += len(files)
        return (art_stored-1)/2 # ((txt+json)-last_log)/2
    def count_art_stored(self, n): # by source
        art_stored = 0
        for root, dirs, files in os.walk('data/' + n):
            art_stored += len(files)
        return art_stored/2 # (txt+json)/2
    def count_sources_list(self):
        media_stored = 0
        for root, dirs, files in os.walk("data/"):
            for d in dirs:
                if d in self.supported_media:
                    media_stored = media_stored + 1
        return media_stored
    def check_art_exists(self, a, n):
        sep = ".json"
        for root, dirs, files in os.walk('data/' + n):
            for f in files:
                if f.endswith(".json"):
                    f = str(f.split(sep, 1)[0])
                    if str(f) in str(a):
                        check_pass = False
                        return check_pass
                    else:
                        check_pass = True
    def create_sources_list(self):
        for root, dirs, files in os.walk("sources/", topdown=False):
            for name in files:
                if name not in self.sources and name in self.supported_media:
                    self.sources.append(name) # add name to sources list
    def check_art_repetitions(self, n, art_url_found):
        sep = "/"
        sep2 = ".html"
        sep3 = "."
        sep4 = "_"
        filenames = []
        for root, dirs, files in os.walk('data/' + n):
            for f in files:
                filename = os.path.basename(f)
                filename = str(filename.split(sep3, 1)[0])
                if not filename in filenames:
                    filenames.append(filename)
        if filenames:
            for f in filenames:
                for a in list(art_url_found): # iterate over a copy: items may be removed below
                    if "eldiario.es" in n:
                        if str(f) in str(a):
                            art_url_found.remove(a)
                    if "elpais.com" in n:
                        f = str(f.split(sep3, 1)[0])
                        ID = str(a.split(sep, 5)[5])
                        ID = str(ID.split(sep2, 1)[0])
                        if sep in ID:
                            ID = str(ID.split(sep, 2)[2])
                        if str(ID) in str(f): # art stored, discard it
                            art_url_found.remove(a)
        return art_url_found
    def is_a_verb(self, w):
        if w.endswith("ar") or w.endswith("er") or w.endswith("ir"): # (Spanish: infinitive verb)
            self.total_verbs = self.total_verbs + 1
            self.verbs.append(w) # add verb to list
    def check_verb(self, verb):
        check_verb_online = str(self.check_verb_online) + verb # url + verb
        self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
        headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
        try:
            reply = urllib2.urlopen(check_verb_online, context=self.ctx).read()
        except:
            print('\n[Error] - Imposible conectar con: ') + check_verb_online + '\n'
            return False
        if "¡Verbo no válido!" in reply or "¡Verbo no encontrado!" in reply: # working at: 30/04/2018
            return False
        else:
            return True
    def generate_data_stream(self):
        if self.options.ssource:
            if self.options.ssource in self.supported_media: # source is supported
                source = 'data/' + self.options.ssource
            else:
                print "-"*25
                print('\n[Error] La fuente indicada de estadísticas no está soportada! \n')
                print "-"*25
                print("\n[Info] Listado de fuentes soportadas :\n")
                n = 0
                for m in self.supported_media:
                    n = n + 1
                    print " + ["+str(n)+"]:", m
                print "" # zen out
                sys.exit(2)
        else:
            source = 'data/'
        if self.options.tsource:
            if self.options.tsource in self.supported_media: # source is supported
                source = 'data/' + self.options.tsource
            else:
                print('\n[Error] La fuente indicada de búsqueda no está soportada! \n')
                print "-"*25
                print("\n[Info] Listado de fuentes soportadas :\n")
                n = 0
                for m in self.supported_media:
                    n = n + 1
                    print " + ["+str(n)+"]:", m
                print "" # zen out
                sys.exit(2)
        else:
            if not self.options.ssource:
                source = 'data/'
        for root, dirs, files in os.walk(source): # generate stream for analysis
            for fl in files:
                if fl.endswith(".json"): # extract content from json archives
                    p = os.path.join(root,fl)
                    kf = io.open(os.path.abspath(p), encoding='utf-8')
                    try:
                        data = str(kf.read().encode('utf-8'))
                    except:
                        data = kf.read()
                    try:
                        self.jdata.append(json.loads(data))
                    except:
                        pass
                    kf.close()
        if not self.jdata:
            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
            print "[Info] Tienes el almacén vacío. Saliendo...\n"
            sys.exit(2) # return
        self.body_news_stream = []
        self.dates_news_stream = []
        self.author_news_stream = []
        self.entry_news_stream = []
        self.title_news_stream = []
        #self.url_news_stream = []
        for record in self.jdata:
            for key, value in record.iteritems(): # unpack a 'huge-stream' dict, stored on a list (a tuple) using iteritems() (black magic!)
                if key == "Noticia": # parse only for body content
                    self.body_news_stream.append(value)
                if key == "Fecha de publicación": # parse only for dates
                    self.dates_news_stream.append(value)
                if key == "Autor(a)": # parse only for authors
                    self.author_news_stream.append(value)
                if self.options.term: # extract more keys when searching a term
                    if key == "Entrada": # parse only for entry content
                        self.entry_news_stream.append(value)
                    if key == "Titular": # parse only for title
                        self.title_news_stream.append(value)
                    #if key == "Fuente": # parse only for url source
                    #    self.url_news_stream.append(value)
    def stats(self):
        print "\n[Info] Recopilando estadísticas del almacén...\n"
        all_art_stored = self.count_all_stored()
        if all_art_stored == 0 or all_art_stored < 0:
            print '-'*25
            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
            print "[Info] Tienes el almacén vacío. Saliendo...\n"
            return
        else:
            print "-"*25
        json_stats = open('data/last_stats.json', "w") # generate json with last stats
        json_stats_data = {}
        self.generate_data_stream() # generate a 'buffer' stream with records (using json files)
        self.create_sources_list()
        media_sources = self.count_sources_list()
        print "\n [+] Total medios:", str(media_sources)
        print " [+] Total noticias:", str(all_art_stored) + "\n"
        json_stats_data.update({"Total medios": str(media_sources)})
        json_stats_data.update({"Total noticias": str(all_art_stored)})
        news_letters = 0
        symbols = []
        letters_dict = {}
        words_dict = {}
        words_3_dict = {}
        words_4_dict = {}
        words_5_dict = {}
        words_6_dict = {}
        words_7_dict = {}
        words_8_dict = {}
        verbs_dict = {}
        authors_dict = {}
        letters_counter = 0
        words_counter = 0
        verbs_counter = 0
        authors_counter = 0
        for news in self.body_news_stream:
            news_parsed = self.remove_punctuation(str(news)) # remove punctuation signs / encode from unicode to str
            news_parsed_noblank = news_parsed.replace(" ","") # data as a stream without blank spaces
            news_parsed_noblank = news_parsed_noblank.lower() # change stream to lowercase
            news_letters = news_letters + len(news_parsed_noblank)
            news_split = news_parsed.split()
            nums = "0123456789" # 0-9
            symbols = map(chr, range(97, 123)) # a-z (A-Z)
            for char in nums:
                symbols.append(char)
            for l in symbols:
                if l in news_parsed_noblank: # only count those letters that exist
                    letters_counter = int(letters_counter + news_parsed_noblank.count(l))
                    if l in letters_dict:
                        g = int(letters_dict[l]) # extract previous value
                    else:
                        g = 0
                    lg = letters_counter + g # sum new letters counted to previous ones
                    letters_dict.update({l:lg}) # update dict with new value
                    letters_counter = 0 # flush counter
            for w in news_split:
                w = w.lower() # change word to lowercase
                if w in words_dict:
                    words_counter = words_counter + 1
                    g = int(words_dict[w])
                else:
                    g = 1
                words_counter = words_counter + g
                words_dict.update({w:words_counter})
                words_counter = 0
                if self.options.checkverbs:
                    self.is_a_verb(w) # check for verbs (adding to a list) using semantic rules
        for key, value in words_dict.iteritems():
            if len(key) == 3:
                words_3_dict[key] = value
            if len(key) == 4:
                words_4_dict[key] = value
            if len(key) == 5:
                words_5_dict[key] = value
            if len(key) == 6:
                words_6_dict[key] = value
            if len(key) == 7:
                words_7_dict[key] = value
            if len(key) > 7:
                words_8_dict[key] = value
        if self.options.ssource:
            print '-'*25
            print "\n[Info] Mostrando estadísticas para:", str(self.options.ssource) + "\n"
        print " [+] Noticia más antigua:",str(min(self.dates_news_stream))
        print " [+] Noticia más reciente:",str(max(self.dates_news_stream)) + "\n"
        json_stats_data.update({"Noticia más antigua": str(min(self.dates_news_stream))})
        json_stats_data.update({"Noticia más reciente": str(max(self.dates_news_stream))})
        print " [+] Total símbolos (a-Z/0-9):", str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")"
        json_stats_data.update({"Total símbolos (a-Z/0-9)": str(news_letters) + " (diferentes: "+ str(len(letters_dict)) + ")" })
        if max(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " [+] Símbolo más utilizado: '"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"
        json_stats_data.update({"Símbolo más utilizado": "'"+str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_max + ")"})
        if min(letters_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_min = "vez"
        else:
            lw_min = "veces"
        print " [+] Símbolo menos repetido: '"+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")" + "\n"
        json_stats_data.update({"Símbolo menos repetido": "'"+str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(min(letters_dict.iteritems(), key=operator.itemgetter(1))[1])+ " " + lw_min + ")"})
        print " [+] Total palabras:", str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")"
        json_stats_data.update({"Total palabras": str(sum(words_dict.values())) + " (diferentes: "+ str(len(words_dict)) + ")" })
        if max(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " [+] Palabra más repetida: '"+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida": "'"+str(max(words_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 3 letras: '"+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida (3 letras)": "'"+str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_3_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 4 letras: '"+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida (4 letras)": "'"+str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_4_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 5 letras: '"+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida (5 letras)": "'"+str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_5_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 6 letras: '"+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida (6 letras)": "'"+str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_6_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 7 letras: '"+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
        json_stats_data.update({"Palabra más repetida (7 letras)": "'"+str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_7_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " - 8+ letras: '"+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")" + "\n"
        json_stats_data.update({"Palabra más repetida (8 letras)": "'"+str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(words_8_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
        if min(words_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
            lw_min = "vez"
        else:
            lw_min = "veces"
        print " [+] Palabra menos repetida: '"+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]).encode('utf-8').strip() + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"
        json_stats_data.update({"Palabra menos repetida": "'"+str(min(words_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(words_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"})
        if max(words_8_dict.iteritems(), key=len)[1] < 2:
            lw_max = "vez"
        else:
            lw_max = "veces"
        print " [+] Palabra más larga y que más se usa: '"+ str(max(words_8_dict.iteritems(), key=len)[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=len)[1]) + " " + lw_max + ")" + "\n"
        json_stats_data.update({"Palabra más larga y que más se usa": "'"+str(max(words_8_dict.iteritems(), key=len)[0]).encode('ISO-8859-1').strip() + "' ("+ str(max(words_8_dict.iteritems(), key=len)[1]).encode('ISO-8859-1').strip() + " " + lw_max + ")"})
        if self.options.checkverbs:
            verb_flag = False
            num = 0
            print "[Info] Analizando (requiere tiempo!) en busca de: 'verbos infinitivos'...\n"
            for verb in self.verbs:
                num = num + 1
                verb_flag = self.check_verb(verb) # re-check previous list of verbs (online!)
                if verb_flag is True: # is a verb
                    if verb in verbs_dict:
                        verbs_counter = verbs_counter + 1
                        g = int(verbs_dict[verb])
                    else:
                        g = 1
                    verbs_counter = verbs_counter + g
                    verbs_dict.update({verb:verbs_counter})
                    verbs_counter = 0
        if not verbs_dict:
            num_verbs = 0
            dif_verbs = 0
        else:
            num_verbs = str(sum(verbs_dict.values()))
            dif_verbs = str(len(verbs_dict))
        if verbs_dict:
            if self.options.checkverbs:
                print " [+] Total verbos (infinitivos):", str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"
                json_stats_data.update({"Total verbos (infinitivos)": str(self.total_verbs) + " (diferentes: "+ str(dif_verbs) + ")"})
                if max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
                    lw_max = "vez"
                else:
                    lw_max = "veces"
                print " [+] Verbo (infinitivo) más utilizado: '"+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"
                json_stats_data.update({"Verbo (infinitivo) más utilizado": str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[0])+ "' ("+ str(max(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_max + ")"})
                if min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1] < 2:
                    lw_min = "vez"
                else:
                    lw_min = "veces"
                print " [+] Verbo (infinitivo) menos repetido: '"+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")" + "\n"
                json_stats_data.update({"Verbo (infinitivo) menos repetido": str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[0]) + "' ("+ str(min(verbs_dict.iteritems(), key=operator.itemgetter(1))[1]) + " " + lw_min + ")"})
        for a in self.author_news_stream:
            if a in authors_dict:
                authors_counter = authors_counter + 1
                g = int(authors_dict[a])
            else:
                g = 1
            authors_counter = authors_counter + g
            authors_dict.update({a:authors_counter})
            authors_counter = 0
        print " [+] Total periodistas:", str(len(authors_dict.items()))
        json_stats_data.update({"Total periodistas": str(len(authors_dict.items()))})
        print " [+] Noticias por periodista:"
        sep = "/"
        num = 0
        for a, c in sorted(authors_dict.items(), key=lambda x: x[1], reverse=True):
            num = num + 1
            if c < 2:
                lg = "noticia"
            else:
                lg = "noticias"
            a = unicode(a)
            a = str(a.encode('utf-8').encode('ISO-8859-1'))
            print " - '"+str(a).title() + "' ("+ str(c) + " " + lg + ")"
            json_stats_data["Autor(a)-["+str(num)+"]"] = str(a).title() + "' ("+ str(c) + " " + lg + ")"
        json_stats.write(json.dumps(json_stats_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # write stats to json file
        json_stats.close() # close .json
    def search(self):
        print "\n[Info] Generando índice de búsqueda...\n"
        all_art_stored = self.count_all_stored()
        if all_art_stored == 0:
            print '-'*25
            print "\n[Info] Necesitas extraer (-e) antes los datos, desde las fuentes.\n"
            print "[Info] Tienes el almacén vacío. Saliendo...\n"
            return
        else:
            print "-"*25
        self.generate_data_stream() # generate a 'buffer' stream with all records (using json files)
        term_reply = str(raw_input("\n $ Introduce una palabra (ej: corrupción): "))
        term_reply = " " + term_reply + " " # parse term_reply to use it as a single word
        counter_term = 0
        counter_term_body = 0
        counter_term_entry = 0
        counter_term_title = 0
        titles_stream = []
        for n in self.body_news_stream:
            if term_reply in n:
                counter_term_body = counter_term_body + 1
        for e in self.entry_news_stream:
            if term_reply in e:
                counter_term_entry = counter_term_entry + 1
        for t in self.title_news_stream:
            if term_reply in t:
                counter_term_title = counter_term_title + 1
                titles_stream.append(t)
        counter_term = counter_term_body + counter_term_entry + counter_term_title
        if counter_term < 2 and counter_term > 0:
            ct = "vez"
        elif counter_term == 0:
            ct = "veces"
        else:
            ct = "veces"
        if counter_term_body < 2 and counter_term_body > 0:
            cb = "artículo"
        elif counter_term_body == 0:
            cb = "artículos"
        else:
            cb = "artículos"
        if counter_term_entry < 2 and counter_term_entry > 0:
            ce = "entrada"
        elif counter_term_entry == 0:
            ce = "entradas"
        else:
            ce = "entradas"
        if counter_term_title < 2 and counter_term_title > 0:
            cl = "titular"
        elif counter_term_title == 0:
            cl = "titulares"
        else:
            cl = "titulares"
        if self.options.tsource:
            print "\n" + '-'*25
            print "\n[Info] Mostrando resultados en:", str(self.options.tsource)
        print "\n [+] Aparece en: ("+str(counter_term_body)+" "+str(cb)+"), ("+str(counter_term_title)+" "+str(cl)+") y ("+str(counter_term_entry)+" "+str(ce)+")"
        print " [+] Sale un total de: ("+str(counter_term)+" "+str(ct)+")" + "\n"
    def run(self, opts=None):
        options = self.create_options(opts)
        self.set_options(options)
        options = self.options
        if options.update: # update tool
            self.banner()
            try:
                print("\n [Info] Tratando de actualizar a la última versión estable:\n")
                Updater()
            except:
                print("\n [Error] Algo ha ido mal!. Para hacer funcionar ésta opción, deberías clonar Propagare lanzando:\n")
                print(" $ git clone https://github.com/epsylon/propagare\n")
        if options.view_media: # show supported sources
            self.banner()
            print("\n[Info] Listado de fuentes soportadas:\n")
            n = 0
            for m in self.supported_media:
                n = n + 1
                print " + ["+str(n)+"]:", m
            print "" # zen out
        if options.stats: # show archive stats
            self.banner()
            self.stats()
        if options.term: # start a 'semantic' search for a term
            self.banner()
            self.search()
        if options.news: # extract news (general run)
            try:
                self.banner()
                print "\n[Info] Buscando las fuentes de datos...\n"
                print "[Info] Examinando el contenido en línea...\n"
                sep = '=='
                sep2 = '?'
                art_url = ''
                art_url_list = []
                art_title = ''
                art_author = ''
                art_location = ''
                num = 0
                flag = True
                if options.esource: # user has set a specific source
                    if options.esource in self.supported_media: # source is supported
                        self.supported_media = options.esource
                    else:
                        print "-"*25
                        print('\n[Error] La fuente que has indicado no está soportada! \n')
                        print "-"*25
                        print("\n[Info] Listado de fuentes soportadas :\n")
                        n = 0
                        for m in self.supported_media:
                            n = n + 1
                            print " + ["+str(n)+"]:", m
                        print "" # zen out
                        return
                self.create_sources_list()
                for n in self.sources: # n = news media source
                    if n.endswith(""):
                        n_url = n.replace("", "/")
                    if not n.startswith("http"):
                        if n == "elmundo.es": # this media only supports access using www
                            n_url = "https://www." + n
                        else:
                            n_url = "https://" + n # SSL only
                    print "- Visitando:", n_url
                    self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
                    headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set fake user-agent and referer
                    try:
                        reply = urllib2.urlopen(n_url, context=self.ctx).read()
                    except:
                        print('\n[Error] - Imposible conectar con: ') + n
                        pass
                    f = open('sources/'+ n)
                    regex = f.readlines()
                    f.close()
                    #print reply # nice to have this output for dev new modules
                    for r in regex: # extract specific keywords from news: time, author, url (+variations), title, description, body
                        if 'art_url==' in r or 'art_url2==' in r:
                            art_url = r
                            regex_art_url = str(art_url.split(sep, 1)[1]) # regex magics (art_url)
                            pattern_art_url = re.compile(regex_art_url)
                        if 'art_author==' in r:
                            art_author = r
                            regex_art_author = str(art_author.split(sep, 1)[1]) # regex magics (art_author)
                            pattern_art_author = re.compile(regex_art_author)
                        if 'art_time==' in r:
                            art_time = r
                            regex_art_time = str(art_time.split(sep, 1)[1]) # regex magics (art_time)
                            pattern_art_time = re.compile(regex_art_time)
                        if 'art_title==' in r:
                            art_title = r
                            regex_art_title = str(art_title.split(sep, 1)[1]) # regex magics (art_title)
                            pattern_art_title = re.compile(regex_art_title)
                        if 'art_description==' in r:
                            art_description = r
                            regex_art_description = str(art_description.split(sep, 1)[1]) # regex magics (art_description)
                            pattern_art_description = re.compile(regex_art_description)
                        if 'art_body==' in r:
                            art_body = r
                            regex_art_body = str(art_body.split(sep, 1)[1]) # regex magics (art_body)
                            pattern_art_body = re.compile(regex_art_body, re.MULTILINE)
                    try:
                        art_url_found = re.findall(pattern_art_url, reply) # found art_url patterns on main page
                        art_url_parsed = self.check_art_repetitions(n, art_url_found) # discard results previously stored
                    except:
                        art_url_parsed = None
                        pass
                    art_stored = self.count_art_stored(n)
                    if not art_url_parsed and not art_stored:
                        print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de ésta fuente): 0\n"
                        return
                    if not art_url_parsed and art_stored > 0: # no new articles found + some articles stored
                        pass
                    elif len(art_url_parsed) == 0 and art_stored == 0: # no new articles found + no articles stored
                        print "\n[Info] Nuevos artículos encontrados: 0 | Total artículos almacenados (de ésta fuente): " + str(art_stored) + "\n"
                        return
                    else: # new article found
                        print "" # zen out
                        for a in art_url_parsed:
                            if "elmundo.es" in n: # re-parsing website: elmundo.es [10/05/2018]
                                if '"' in a:
                                    a = str(a.split('"', 1)[0])
                                if '#' in a:
                                    a = str(a.split('#', 1)[0])
                                if not "http" in a or not "www.elmundo.es" in a or "vivienda" in a or "horoscopo" in a or "menu" in a or "?" in a or "indices" in a or "programacion-tv" in a:
                                    a = None
                            if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
                                if '" title="' in a:
                                    a = str(a.split('"', 1)[0])
                                if '">' in a:
                                    a = str(a.split('"', 1)[0])
                                if "rastreador" in a or "http" in a or "autores" in a or "www.eldiario.es" in a or "/carnecruda" in a or "/contenido_patrocinado" in a:
                                    a = None
                            if "elpais.com" in n: # re-parsing website: elpais.com [24/04/2018]
                                if "?" in a:
                                    a = str(a.split(sep2, 1)[0])
                                if "posicionador" in a:
                                    a = a.replace('" class="posicionador',"")
                                if "elpais.com" in a:
                                    a = "https:/" + a
                                else:
                                    a = n_url + "/" + a
                            if a is not None:
                                if "eldiario.es" in n: # re-parsing website: eldiario.es [09/05/2018]
                                    a = "https://eldiario.es" + a
                                if a not in art_url_list:
                                    check_pass = self.check_art_exists(a, n) # check if art found is previously stored for this media
                                    if check_pass is True or check_pass is None:
                                        art_url_list.append(a) # crawled pages from main website
                                        print " + [IA]:", a
                    if not art_url_list:
                        pass
                    else:
                        print "\n[Info] Nuevos artículos encontrados: " + str(len(art_url_list)) + " | Total artículos almacenados (de ésta fuente): " + str(art_stored) + "\n"
                        print "- Extrayendo:", n_url
                        if not os.path.exists('data/'):
                            os.makedirs('data/')
                        if not os.path.exists('data/' + n):
                            os.makedirs('data/' + n)
                    for a in art_url_list:
                        num = num + 1 # art counter
                        json_data = {} # json dict stream buffer
                        if '"' in a: # re-parse url searching for " after it
                            sep = '"'
                            a = str(a.split(sep, 1)[0])
                        print "\n + ["+str(num)+"/"+str(len(art_url_list))+"] Visitando:", a
                        self.user_agent = random.choice(self.agents).strip() # shuffle user-agent
                        headers = {'User-Agent' : self.user_agent, 'Referer' : self.referer} # set user-agent and referer
                        try:
                            reply_art = urllib2.urlopen(a, context=self.ctx).read()
                        except:
                            print('\n[Error] - Imposible conectar con: ') + a
                            return
                        art_url_author_found = re.findall(pattern_art_author, reply_art) # found art_author pattern on page
                        if not art_url_author_found:
                            for r in regex: # extract another combination
                                if 'art_author2==' in r:
                                    art_author = r
                                    try:
                                        regex_art_author = str(art_author.split(sep, 1)[1]) # re-regex magics (art_author)
                                        pattern_art_author = re.compile(regex_art_author)
                                    except:
                                        break
                                    art_url_author_found = re.findall(pattern_art_author, reply_art) # try another art_author pattern on page
                            if not art_url_author_found: # no author found using regex (use default for each media)
                                if "elmundo.es" in n: # default author for elmundo.es when not signed
                                    art_url_author_found.append("Ediciones El Mundo")
                                if "elpais.com" in n: # default author for elpais.com when not signed
                                    art_url_author_found.append("Ediciones El País")
                                if "eldiario.es" in n: # default author for eldiario.es when not signed
                                    art_url_author_found.append("Ediciones El Diario")
                        else:
                            if "elpais.com" in n: # based on specific reg exp.
                                sep = '"'
                                for author in art_url_author_found:
                                    art_url_author_found.remove(author)
                                    author = str(author.split(sep, 1)[0])
                                    art_url_author_found.append(author)
                        art_url_time_found = re.findall(pattern_art_time, reply_art) # found art_time pattern on page
                        art_url_title_found = re.findall(pattern_art_title, reply_art) # found art_title pattern on page
                        art_url_description_found = re.findall(pattern_art_description, reply_art) # found art_description pattern on page
                        art_url_body_found = re.findall(pattern_art_body, reply_art) # found art_body pattern on page (MULTILINE)
                        if not art_url_body_found: # no body found
                            if "eldiario.es" in n:
                                for r in regex: # extract another combination
                                    if 'art_body2==' in r:
                                        art_body = r
                                        regex_art_body = str(art_body.split(sep, 1)[1])
                                        pattern_art_body = re.compile(regex_art_body)
                                        art_url_body_found = re.findall(pattern_art_body, reply_art)
                        time.sleep(0.1) # tic, tac!!!
                        self.update_progress("\n - ETA", num, len(art_url_list))
                        print "" # zen out
  750. if "elmundo.es" in a: # [10/05/2018] # schema: https://www.elmundo.es/category/{tag}/date/ID
  751. a_path = a.replace("http://www.elmundo.es/","") # remove pre-url (note: http)
  752. a_path = a_path.replace(".html","") # remove post-url
  753. if "elpais.com" in a: # [24/04/2018] # schema: https://elpais.com/category/date/tag/ID
  754. a_path = a.replace("https://elpais.com/","") # remove pre-url
  755. a_path = a_path.split(".html") # remove post-url
  756. if "elpais.com" in a_path: # re-parsing url [24/04/2018] # schema: https://loc.elpais.com/...
  757. a_path = a_path.split("elpais.com/")
  758. a_path = a_path[1]
  759. if "eldiario.es" in a: # [08/05/2018] # schema: https://eldiario.es/category/{tag}/ID
  760. a_path = a.replace("https://eldiario.es/","") # remove pre-url
  761. a_path = a_path.replace(".html","") # remove post-url
  762. if "/" in a_path: # / mostly used like url-category sep keyword
  763. a_path = a_path.split("/")
  764. if "elmundo.es" in a:
  765. try: # try with /tag/
  766. category = a_path[0]
  767. date_year = a_path[2]
  768. date_month = a_path[3]
  769. date_day = a_path[4]
  770. date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
  771. tag = a_path[1]
  772. ID = a_path[5]
  773. if not os.path.exists('data/'+n+"/"+category):
  774. os.makedirs('data/'+n+"/"+category)
  775. if not os.path.exists('data/'+n+"/"+category+"/"+date):
  776. os.makedirs('data/'+n+"/"+category+"/"+date)
  777. if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag): # create new record
  778. os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
  779. path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+".txt" # set path to file
  780. self.generate_json(n, category, date, tag, ID) # generate .json
  781. except:
  782. try:
  783. category = a_path[0]
  784. date_year = a_path[1]
  785. date_month = a_path[2]
  786. date_day = a_path[3]
  787. date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
  788. ID = a_path[4]
  789. if not os.path.exists('data/'+n+"/"+category):
  790. os.makedirs('data/'+n+"/"+category)
  791. if not os.path.exists('data/'+n+"/"+category+"/"+date):
  792. os.makedirs('data/'+n+"/"+category+"/"+date)
  793. path = 'data/'+n+"/"+category+"/"+date+"/"+ID+".txt" # set path to file
  794. self.generate_json(n, category, date, None, ID) # generate .json
  795. except:
  796. pass
  797. if "eldiario.es" in a:
  798. category = a_path[0]
  799. ID = a_path[1]
  800. if not os.path.exists('data/'+n+"/"+category):
  801. os.makedirs('data/'+n+"/"+category)
  802. if not os.path.exists('data/'+n+"/"+category+"/"+ID):
  803. os.makedirs('data/'+n+"/"+category+"/"+ID)
  804. path = 'data/'+n+"/"+category+"/"+ID+"/"+ID+".txt" # set path to file
  805. self.generate_json(n, category, None, None, ID) # generate .json
  806. if "elpais.com" in a:
  807. category = a_path[0]
  808. date_year = a_path[1]
  809. date_month = a_path[2]
  810. date_day = a_path[3]
  811. date = date_year + "_" + date_month + "_" + date_day # date: year/month/day
  812. tag = a_path[4]
  813. ID = a_path[5]
  814. if not os.path.exists('data/'+n+"/"+category):
  815. os.makedirs('data/'+n+"/"+category)
  816. if not os.path.exists('data/'+n+"/"+category+"/"+date):
  817. os.makedirs('data/'+n+"/"+category+"/"+date)
  818. if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag):
  819. os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag)
  820. if not os.path.exists('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID): # create new record
  821. os.makedirs('data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID)
  822. path = 'data/'+n+"/"+category+"/"+date+"/"+tag+"/"+ID+"/"+ID+".txt" # set path to file
  823. self.generate_json(n, category, date, tag, ID) # generate .json
                        try:
                            fs = open(path, "w") # generate .txt
                        except:
                            print "\n[Error] No se pueden extraer las noticias de manera correcta. Saliendo...\n"
                            return
                        fs.write("Fuente: " + str(a).encode('utf-8') + "\n") # write source url
                        json_data.update({"Fuente": str(a)})
                        for t in art_url_time_found:
                            fs.write("Fecha de publicación: " + str(t).encode('utf-8') + "\n") # write time
                            json_data.update({"Fecha de publicación": str(t)})
                        for author in art_url_author_found:
                            if "\t" in author:
                                author = author.split('\t', 1)[1] # re-parse for \t
                            author = str(author.decode('ISO-8859-1').strip())
                            fs.write("Autor(a): " + str(author).encode('utf-8') + "\n") # write author
                            json_data.update({"Autor(a)": str(author)})
                        for title in art_url_title_found:
                            title = str(title.decode('ISO-8859-1').strip())
                            parsed = self.format_content(str(title))
                            fs.write("Titular: " + str(parsed).encode('utf-8') + "\n") # write title
                            json_data.update({"Titular": str(parsed)})
                        for description in art_url_description_found:
                            description = str(description.decode('ISO-8859-1').strip())
                            parsed = self.format_content(str(description))
                            fs.write("Entrada: " + str(parsed).encode('utf-8') + "\n") # write description
                            json_data.update({"Entrada": str(parsed)})
                        body_complete = ""
                        for body in art_url_body_found:
                            body = str(body.decode('ISO-8859-1').strip())
                            if "elmundo.es" in a:
                                body = body.split("TE PUEDE INTERESAR",1)[0]
                                if "###" in body:
                                    body = body.replace("###","")
                            if "elpais.com" in a:
                                body = body.replace("<span>Explora nuestras historias</span> por temas","")
                                body = body.replace("Recibe nuestra newsletter", "")
                            body_complete += body + "\n\n"
                            if "elmundo.es" in a:
                                break
                        parsed = self.format_content(body_complete)
                        fs.write("\n" + str(parsed).encode('utf-8')) # write (plain text) body without keyword
                        json_data.update({"Noticia": str(parsed)})
                        self.json_report.write(json.dumps(json_data, sort_keys=True, indent=2, separators=(',', ':'), ensure_ascii=False)) # json dump
                        fs.close() # close .txt
                        self.json_report.close() # close .json
                        self.total_num = self.total_num + 1
                    num = 0 # flush art found counter
                    art_url_list = [] # flush art list
                all_art_stored = self.count_all_stored()
                if self.total_num:
                    print "" # zen out
                else:
                    self.total_num = 0
                print "\n[Info] Nuevos artículos descargados: " + str(self.total_num) + " | Total artículos almacenados (de todas las fuentes): " + str(all_art_stored) + "\n"
                if all_art_stored > 0:
                    if not self.options.forceno:
                        print "-"*25
                        stats_reply = raw_input("¿Quieres ver las estadísticas comparadas (S/n)?\n")
                    else:
                        stats_reply = "N"
                    if stats_reply == "s" or stats_reply == "S":
                        self.stats()
                    else:
                        print "\n[Info] Saliendo...\n"
                        return
                    if not self.options.forceno:
                        print "-"*25
                        search_reply = raw_input("¿Quieres buscar información semántica (S/n)?\n")
                    else:
                        search_reply = "N"
                    if search_reply == "s" or search_reply == "S":
                        self.search()
                    else:
                        print "\n[Info] Saliendo...\n"
            except (KeyboardInterrupt, SystemExit):
                sys.exit()
if __name__ == "__main__":
    app = Propagare()
    app.run()