diana.py 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901
  1. #!/usr/bin/env python
# -*- coding: utf-8 -*-
  3. """
  4. DiaNA - 2020 - by psy (epsylon@riseup.net)
  5. You should have received a copy of the GNU General Public License along
  6. with DiaNA; if not, write to the Free Software Foundation, Inc., 51
  7. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  8. """
  9. VERSION = "v0.2_beta"
  10. RELEASE = "17032020"
  11. SOURCE1 = "https://code.03c8.net/epsylon/diana"
  12. SOURCE2 = "https://github.com/epsylon/diana"
  13. CONTACT = "epsylon@riseup.net - (https://03c8.net)"
  14. """
  15. DNA-equiv:
  16. A <-> T
  17. C <-> G
  18. """
  19. import re, os, glob, random, time, math
brain_path = "datasets/brain.in" # brain file: read back and appended to by the functions below
genomes_path = 'datasets/' # folder holding the raw genome datasets
genomes_list_path = "datasets/genome.list" # report file written by list_genomes_on_database()
genomes = {} # main sources dict: genome_name -> sequence text
seeds_checked = [] # random seeds (lists of ints 0-3) already turned into patterns
repeats = {} # shared 'tmp' results dict: genome_name -> (repeats, pattern), or (index, p_index) -> (pattern, genome_name, repeats)
known_patterns = [] # patterns already searched during this run
dna_alphabet = ["A", "C", "G", "T"] # dna alphabet
max_length = 50 # [MAX. LENGTH] for range [PATTERN]
  29. def convert_size(size):
  30. if (size == 0):
  31. return '0 B'
  32. size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
  33. i = int(math.floor(math.log(size,1024)))
  34. p = math.pow(1024,i)
  35. s = round(size/p,2)
  36. return s, size_name[i]
  37. def search_pattern_with_human():
  38. pattern = input("[HUMAN] [SEARCH] Pattern (ex: attacg): ").upper()
  39. print("\n"+"-"*5 + "\n")
  40. create_new_pattern(pattern) # create new pattern
  41. def try_pattern_against_all_genomes_by_genome(pattern):
  42. for k, v in genomes.items():
  43. if pattern in v:
  44. t = len(re.findall(pattern, v))
  45. repeats[k] = t, pattern # create dict: genome = times, pattern
  46. def try_pattern_against_all_genomes_by_pattern(pattern, index):
  47. p_index = 0 # pattern index
  48. for k, v in genomes.items():
  49. if pattern in v:
  50. p_index = p_index + 1
  51. t = len(re.findall(pattern, v))
  52. repeats[index,p_index] = pattern, k, t # create dict: index, p_index = pattern, genome, times
  53. def sanitize_dna_pattern(pattern):
  54. valid_pattern = True
  55. for c in pattern:
  56. if c == "A":
  57. pass
  58. elif c == "T":
  59. pass
  60. elif c == "G":
  61. pass
  62. elif c == "C":
  63. pass
  64. elif c == "N":
  65. pass
  66. else:
  67. valid_pattern = False
  68. return valid_pattern
  69. def teach_ai():
  70. mode = input("[TRAIN-AI] MODE -> (H)uman, (A)utomata: ").upper()
  71. if not os.path.isfile(brain_path):
  72. create_initial_seed_file()
  73. if mode == "H": # human mode
  74. teach_ai_human_mode()
  75. else: # libre AI
  76. teach_ai_automata_mode() # automata mode
  77. def teach_ai_human_mode(): # search/discard patterns with human interaction & generate local database
  78. search_patterns_lesson_with_a_human()
  79. def search_patterns_lesson_with_a_human():
  80. print("\n"+"-"*30)
  81. print("\n[TRAIN-AI] [HUMAN] [STOP] this mode; just entering whatever invalid pattern (ex: 'exit' or 'q').\n")
  82. key = "K" # continue
  83. while key == "K":
  84. pattern = input("[TRAIN-AI] [HUMAN] [LOOP] [SEARCH] Pattern (ex: attacg): ").upper()
  85. print("\n"+"-"*5 + "\n")
  86. key = search_pattern_on_lesson(pattern)
  87. if key == "Z": # stop
  88. break
  89. def search_pattern_on_lesson(pattern):
  90. valid_pattern = sanitize_dna_pattern(pattern)
  91. if valid_pattern == True:
  92. key = search_pattern_on_local_database(pattern) # search pattern on local database
  93. else:
  94. print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
  95. key = "Z" # stop
  96. return key
  97. def search_pattern_on_local_database(pattern):
  98. f=open(brain_path, 'r')
  99. memory = f.read().replace('\n',' ')
  100. f.close()
  101. patterns_known = 0
  102. if not "'"+pattern+"'" in memory: # always create new patterns
  103. create_new_pattern(pattern) # create new pattern
  104. patterns_known = patterns_known + 1
  105. else:
  106. for k, v in genomes.items(): # create patterns found for new genomes
  107. if k not in memory:
  108. create_new_pattern(pattern) # create new pattern
  109. patterns_known = patterns_known + 1
  110. if patterns_known == 0:
  111. print("[TRAIN-AI] [AUTOMATA] [LOOP] [RESULTS] -ALREADY- [LEARNED!] ... -> [GOING FOR NEXT!]\n")
  112. print("-"*5 + "\n")
  113. key = "K" # continue
  114. return key
  115. def create_initial_seed_file():
  116. f=open(brain_path, 'w')
  117. f.write("")
  118. f.close()
  119. def create_new_pattern(pattern): # append it to brain
  120. valid_pattern = sanitize_dna_pattern(pattern)
  121. if valid_pattern == True:
  122. if pattern not in known_patterns:
  123. known_patterns.append(pattern)
  124. try_pattern_against_all_genomes_by_genome(pattern) # generate repeats dict
  125. patterns_found = 0
  126. for k, v in repeats.items(): # list patterns found to output
  127. print (" *", k +":", "-> ",v,"")
  128. patterns_found = patterns_found + 1
  129. print("")
  130. if patterns_found == 0:
  131. print("[INFO] -> Not any found! ... [EXITING!]\n")
  132. else:
  133. f=open(brain_path, 'a')
  134. f.write(str(repeats)+os.linesep) # add dict as str
  135. f.close()
  136. else:
  137. print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
  138. def teach_ai_automata_mode(): # search patterns by bruteforcing ranges & generate local database
  139. search_patterns_lesson_with_an_ai()
  140. def search_patterns_lesson_with_an_ai():
  141. print("\n"+"-"*30)
  142. print("\n[TRAIN-AI] [AUTOMATA] [STOP] this mode; pressing 'CTRL+z'.\n")
  143. ranges = input("[TRAIN-AI] [AUTOMATA] [SEARCH] Set range (x<y) for pattern deep searching (ex: 2-8): ")
  144. print ("")
  145. valid_range, ranged_permutations = check_for_deep_searching_ranges(ranges)
  146. if str(valid_range) == "OK!":
  147. ranged_ending = False
  148. print("-"*15)
  149. print("\n[TRAIN-AI] [AUTOMATA] [SEARCH] Number of [PERMUTATIONS] estimated: [ "+str(ranged_permutations)+" ]\n")
  150. time.sleep(10)
  151. print("-"*15+"\n")
  152. num_pat = 0
  153. time.sleep(10)
  154. while ranged_ending == False: # try to STOP it using: CTRL-z
  155. try:
  156. pattern, ranged_ending = generate_random_pattern(ranges, ranged_permutations) # generate random seed
  157. if pattern:
  158. num_pat = num_pat + 1
  159. print("[TRAIN-AI] [AUTOMATA] [LOOP] [SEARCH] Generating [RANDOM!] ["+str(num_pat)+"/"+str(ranged_permutations)+"] pattern: [ " + str(pattern) + " ]\n")
  160. if not num_pat == ranged_permutations:
  161. search_pattern_on_lesson(pattern)
  162. else:
  163. search_pattern_on_lesson(pattern)
  164. print("[TRAIN-AI] [AUTOMATA] [RESULTS]: REVIEWED -> [ "+str(ranged_permutations)+" PERMUTATIONS ] ... -> [EXITING!]\n")
  165. ranged_ending = True
  166. except:
  167. pass
  168. else:
  169. print("-"*15+"\n")
  170. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] -> "+valid_range+" ... [EXITING!]\n")
  171. def generate_random_pattern(ranges, ranged_permutations):
  172. ranged_length = 0
  173. try:
  174. range_low = int(ranges.split("-")[0])
  175. range_high = int(ranges.split("-")[1])
  176. for i in range(range_low, range_high+1):
  177. ranged_length = ranged_length + 1
  178. if ranged_length == ranged_permutations: # all possible variables have been bruteforced/checked! -> exit
  179. pattern = None
  180. ranged_ending = True
  181. return pattern, ranged_ending
  182. else:
  183. ranged_ending = False
  184. seed = [random.randrange(0, 4) for _ in range(i)] # generate "random" seed
  185. if seed not in seeds_checked:
  186. seeds_checked.append(seed)
  187. pattern = ""
  188. for n in seed:
  189. if n == 0:
  190. pattern += "A"
  191. elif n == 1:
  192. pattern += "C"
  193. elif n == 2:
  194. pattern += "T"
  195. else:
  196. pattern += "G"
  197. return pattern, ranged_ending
  198. except:
  199. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] ... [EXITING!]\n")
  200. pattern = None
  201. ranged_ending = True
  202. return pattern, ranged_ending
  203. def check_for_deep_searching_ranges(ranges):
  204. try:
  205. range_low = ranges.split("-")[0]
  206. range_high = ranges.split("-")[1]
  207. except:
  208. valid_range = "'bad format'"
  209. try:
  210. range_low = int(range_low)
  211. except:
  212. valid_range = "'low range' should be an integer"
  213. try:
  214. range_high = int(range_high)
  215. except:
  216. valid_range = "'high range' should be an integer"
  217. try:
  218. if range_low < range_high:
  219. if range_low > 1: # always range > 1
  220. valid_range = "OK!"
  221. else:
  222. valid_range = "'low range' should be > than 1"
  223. else:
  224. valid_range = "'low range' should be < than 'high range'"
  225. except:
  226. valid_range = "'bad format'"
  227. try:
  228. ranged_permutations = math_ranged_permutations(range_low, range_high)
  229. except:
  230. ranged_permutations = 0
  231. valid_range = "'bad format'"
  232. return valid_range, ranged_permutations
  233. def math_ranged_permutations(range_low, range_high): # calculate ranged_permutations
  234. ranged_permutations = 0
  235. for i in range(range_low, range_high+1):
  236. ranged_permutations = ranged_permutations + (4**i)
  237. return ranged_permutations
  238. def libre_ai(): # show statistics / download new genomes / keep crossing new genomes with local database / search for new patterns (non stop!)
  239. if not os.path.isfile(brain_path):
  240. create_initial_seed_file()
  241. memory = examine_stored_brain_memory()
  242. if memory != "":
  243. #print("[LIBRE-AI] [STOP] this mode; pressing 'CTRL+z'.\n")
  244. libre_ai_show_statistics(memory) # show statistics
  245. def libre_ai_show_statistics(memory):
  246. print("[LIBRE-AI] [REPORTING] [STATISTICS] ... -> [STARTING!]\n")
  247. print("-"*15 + "\n")
  248. total_genomes = 0
  249. total_adenine = 0
  250. total_guanine = 0
  251. total_cytosine = 0
  252. total_thymine = 0
  253. total_any = 0
  254. total_patterns = 0
  255. secuence_length = 0
  256. secuences_length_list = {}
  257. largest = None
  258. largest_len = 0
  259. shortest_len = 0
  260. average = None
  261. shortest = None
  262. for k, v in genomes.items():
  263. secuence_length = len(v)
  264. secuences_length_list[k] = str(secuence_length)
  265. total_genomes = total_genomes + 1
  266. total_adenine = total_adenine + v.count("A")
  267. total_guanine = total_guanine + v.count("G")
  268. total_cytosine = total_cytosine + v.count("C")
  269. total_thymine = total_thymine + v.count("T")
  270. total_any = total_any + v.count("N")
  271. path = genomes_path # genome datasets raw data
  272. l = glob.glob(genomes_path+"*") # black magic!
  273. latest_collection_file = max(l, key=os.path.getctime)
  274. latest_collection_date = time.ctime(os.path.getmtime(latest_collection_file))
  275. total_nucleotids = [total_adenine, total_guanine, total_cytosine, total_thymine, total_any]
  276. num_total_nucleotids = total_adenine + total_guanine + total_cytosine + total_thymine + total_any
  277. nucleotid_more_present = max(total_nucleotids)
  278. print("[LIBRE-AI] [REPORTING] -STORAGE- [STATISTICS]: \n")
  279. extract_storage_sizes()
  280. print(" * [LATEST UPDATE]: '"+str(latest_collection_date)+"'\n")
  281. print(" + File: '"+str(latest_collection_file)+"'\n")
  282. print("-"*5 + "\n")
  283. print("[LIBRE-AI] [REPORTING] -COLLECTION- [STATISTICS]: \n")
  284. extract_total_patterns_learned_from_local(memory)
  285. print("\n"+"-"*5 + "\n")
  286. print("[LIBRE-AI] [REPORTING] -ANALYSIS- [STATISTICS]: \n")
  287. print(" * Total [DNA SECUENCES]: [ "+str(total_genomes)+" ]\n")
  288. largest = 0
  289. largest_pattern_name = []
  290. largest_pattern_size = []
  291. for k, v in secuences_length_list.items():
  292. if int(v) > int(largest):
  293. largest = v
  294. largest_pattern_name.append(k)
  295. largest_pattern_size.append(largest)
  296. for p in largest_pattern_name:
  297. largest_pattern_name = p
  298. for s in largest_pattern_size:
  299. largest_pattern_size = s
  300. print(" + [LARGEST] : "+str(largest_pattern_name)+ " [ "+str(largest_pattern_size)+" bp linear RNA ]")
  301. prev_shortest = None
  302. shortest_pattern_name = []
  303. shortest_pattern_size = []
  304. for k, v in secuences_length_list.items():
  305. if prev_shortest == None:
  306. shortest = v
  307. shortest_pattern_name.append(k)
  308. shortest_pattern_size.append(shortest)
  309. prev_shortest = True
  310. else:
  311. if int(v) < int(shortest):
  312. shortest = v
  313. shortest_pattern_name.append(k)
  314. shortest_pattern_size.append(shortest)
  315. for p in shortest_pattern_name:
  316. shortest_pattern_name = p
  317. for s in shortest_pattern_size:
  318. shortest_pattern_size = s
  319. print(" + [SHORTEST]: "+str(shortest_pattern_name)+ " [ "+str(shortest_pattern_size)+" bp linear RNA ]\n")
  320. print(" * Total [NUCLEOTIDS]: [ "+str(num_total_nucleotids)+" ]\n")
  321. if nucleotid_more_present == total_adenine:
  322. print(" + [A] Adenine : "+str(total_adenine)+" <- [MAX]")
  323. else:
  324. print(" + [A] Adenine : "+str(total_adenine))
  325. if nucleotid_more_present == total_guanine:
  326. print(" + [G] Guanine : "+str(total_guanine)+" <- [MAX]")
  327. else:
  328. print(" + [G] Guanine : "+str(total_guanine))
  329. if nucleotid_more_present == total_cytosine:
  330. print(" + [C] Cytosine : "+str(total_cytosine)+" <- [MAX]")
  331. else:
  332. print(" + [C] Cytosine : "+str(total_cytosine))
  333. if nucleotid_more_present == total_thymine:
  334. print(" + [T] Thymine : "+str(total_thymine)+" <- [MAX]")
  335. else:
  336. print(" + [T] Thymine : "+str(total_thymine))
  337. if total_any > 0:
  338. if nucleotid_more_present == total_any:
  339. print(" + [N] *ANY* : "+str(total_any)+" <- [MAX]\n")
  340. else:
  341. print(" + [N] *ANY* : "+str(total_any)+"\n")
  342. print("-"*5 + "\n")
  343. extract_pattern_most_present_local(memory)
  344. def convert_memory_to_dict(memory): # [index] = genome_name, pattern, num_rep
  345. memory_dict = {}
  346. index = 0
  347. for m in memory:
  348. regex_record = "'(.+?)': (.+?), '(.+?)'" # regex magics! - extract first each record
  349. pattern_record = re.compile(regex_record)
  350. record = re.findall(pattern_record, m)
  351. for r in record: # now extract each field
  352. index = index + 1
  353. name = str(r).split("', '(")[0]
  354. genome_name = str(name).split("'")[1]
  355. repeats = str(r).split("', '(")[1]
  356. genome_repeats = str(repeats).split("',")[0]
  357. pattern = str(repeats).split("',")[1]
  358. genome_pattern = pattern.replace(" ", "")
  359. genome_pattern = genome_pattern.replace("'", "")
  360. genome_pattern = genome_pattern.replace(")", "")
  361. memory_dict[index] = genome_name, genome_pattern, genome_repeats # generate memory_dict!
  362. return memory_dict
  363. def extract_pattern_most_present_local(memory):
  364. memory_dict = convert_memory_to_dict(memory)
  365. if memory_dict:
  366. print("[LIBRE-AI] [REPORTING] -RESEARCHING- [STATISTICS]: \n")
  367. total_genomes = 0
  368. total_patterns = 0
  369. for k, v in genomes.items():
  370. total_genomes = total_genomes + 1
  371. for m in memory:
  372. total_patterns = total_patterns + 1 # counter used for known patterns
  373. max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list = extract_patterns_most_found_in_all_genomes(memory_dict)
  374. print(" * Trying -[ "+str(total_patterns)+" ]- [PATTERNS LEARNED!] against -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
  375. print("\n + Total [PATTERNS FOUND!]: [ "+str(total_patterns_all_genomes)+" ]")
  376. print("\n - [MOST-PRESENT!]: [ "+str(biggest_pattern_size)+" ] time(s) -> [ "+str(biggest_pattern_name)+" ]\n")
  377. for k, v in most_present_patterns_by_len_list.items():
  378. print(" * [length = "+str(k)+"] : [ "+str(v[1])+" ] time(s) -> [ "+str(v[0])+" ]")
  379. print("\n - [LESS-PRESENT!]: [ "+str(smaller_pattern_size)+" ] time(s) -> [ "+str(smaller_pattern_name)+" ]\n")
  380. for n, m in less_present_patterns_by_len_list.items():
  381. print(" * [length = "+str(n)+"] : [ "+str(m[1])+" ] time(s) -> [ "+str(m[0])+" ]")
  382. max_size_pattern_name = max(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
  383. less_size_pattern_name = min(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
  384. print("\n - [LARGEST] : [ "+str(max_size_pattern_name)+" ] bp linear RNA")
  385. print(" - [SHORTEST]: [ "+str(less_size_pattern_name)+" ] bp linear RNA\n")
  386. def extract_patterns_most_found_in_all_genomes(memory_dict):
  387. present_patterns = []
  388. for m, p in memory_dict.items():
  389. pattern = p[1]
  390. if pattern not in present_patterns:
  391. present_patterns.append(pattern)
  392. index = 0 # genome num index
  393. for pattern in present_patterns:
  394. index = index + 1
  395. try_pattern_against_all_genomes_by_pattern(pattern, index)
  396. total_patterns_all_genomes = 0
  397. largest_size_by_pattern = {}
  398. largest_size_by_pattern_index = 0
  399. for k,v in repeats.items():
  400. largest_size_by_pattern_index = largest_size_by_pattern_index + 1
  401. total_patterns_all_genomes = total_patterns_all_genomes + v[2] # total patterns all genomes
  402. largest_size_by_pattern[largest_size_by_pattern_index] = v[0], v[2]
  403. total_patterns_by_pattern = 0
  404. list_total_patterns_by_pattern = {}
  405. for i, v in largest_size_by_pattern.items():
  406. total_patterns_by_pattern = total_patterns_by_pattern + v[1]
  407. list_total_patterns_by_pattern[v[0]] = total_patterns_by_pattern
  408. total_patterns_by_pattern = 0 # reset patterns counter
  409. biggest_pattern_name = None
  410. biggest_pattern_size = 0
  411. smaller_pattern_name = None
  412. smaller_pattern_size = 0
  413. max_size_pattern = 0
  414. for r, z in list_total_patterns_by_pattern.items():
  415. pattern_length = len(r)
  416. if pattern_length > max_size_pattern:
  417. max_size_pattern_name = r
  418. if biggest_pattern_name == None:
  419. biggest_pattern_name = r
  420. smaller_pattern_name = r
  421. biggest_pattern_size = z
  422. smaller_pattern_size = z
  423. less_size_pattern_name = r
  424. less_size_pattern_size = z
  425. else:
  426. if pattern_length < less_size_pattern_size:
  427. less_size_pattern_size = pattern_length
  428. less_size_pattern_name = r
  429. if z > biggest_pattern_size:
  430. biggest_pattern_name = r
  431. biggest_pattern_size = z
  432. else:
  433. if z < smaller_pattern_size:
  434. smaller_pattern_name = r
  435. smaller_pattern_size = z
  436. most_present_patterns_by_len_list = extract_most_present_pattern_by_len(list_total_patterns_by_pattern)
  437. less_present_patterns_by_len_list = extract_less_present_pattern_by_len(list_total_patterns_by_pattern)
  438. return max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list
  439. def extract_most_present_pattern_by_len(list_total_patterns_by_pattern):
  440. most_present_patterns_by_len_list = {}
  441. for k, v in list_total_patterns_by_pattern.items():
  442. pattern_len = len(k)
  443. if pattern_len in most_present_patterns_by_len_list.keys():
  444. if v > most_present_patterns_by_len_list[pattern_len][1]:
  445. most_present_patterns_by_len_list[pattern_len] = k, v
  446. else:
  447. most_present_patterns_by_len_list[pattern_len] = k, v
  448. return most_present_patterns_by_len_list
  449. def extract_less_present_pattern_by_len(list_total_patterns_by_pattern):
  450. less_present_patterns_by_len_list = {}
  451. for k, v in list_total_patterns_by_pattern.items():
  452. pattern_len = len(k)
  453. if pattern_len in less_present_patterns_by_len_list.keys():
  454. if v < less_present_patterns_by_len_list[pattern_len][1]:
  455. less_present_patterns_by_len_list[pattern_len] = k, v
  456. else:
  457. less_present_patterns_by_len_list[pattern_len] = k, v
  458. return less_present_patterns_by_len_list
  459. def extract_storage_sizes():
  460. total_dataset_size = 0
  461. total_files_size = 0
  462. total_list_size = 0
  463. for file in glob.iglob(genomes_path + '*/*/*', recursive=True): # extract datasets sizes
  464. if(file.endswith(".genome")):
  465. total_dataset_size = total_dataset_size + len(file)
  466. try:
  467. f=open(brain_path, "r") # extract brain sizes
  468. total_brain_size = len(f.read())
  469. f.close()
  470. except:
  471. total_brain_size = 0
  472. try:
  473. f=open(genomes_list_path, "r") # extract genomes list sizes
  474. total_list_size = len(f.read())
  475. f.close()
  476. except:
  477. total_list_size = 0
  478. if total_dataset_size > 0:
  479. total_files_size = int(total_files_size) + int(total_dataset_size)
  480. dataset_s, dataset_size_name = convert_size(total_dataset_size)
  481. total_dataset_size = '%s %s' % (dataset_s,dataset_size_name)
  482. if total_brain_size > 0:
  483. total_files_size = int(total_files_size) + int(total_brain_size)
  484. brain_s, brain_size_name = convert_size(total_brain_size)
  485. total_brain_size = '%s %s' % (brain_s,brain_size_name)
  486. if total_list_size > 0:
  487. total_files_size = int(total_files_size) + int(total_list_size)
  488. list_s, list_size_name = convert_size(total_list_size)
  489. total_list_size = '%s %s' % (list_s,list_size_name)
  490. total_s, total_size_name = convert_size(total_files_size)
  491. total_files_size = '%s %s' % (total_s,total_size_name)
  492. print(" * Total [FILE SIZES]: "+str(total_files_size)+"\n")
  493. if total_dataset_size:
  494. print(" + [DATASET]: "+str(total_dataset_size))
  495. if total_list_size:
  496. print(" + [LIST]: "+str(total_list_size))
  497. if total_brain_size:
  498. print(" + [BRAIN]: "+str(total_brain_size)+"\n")
  499. def extract_total_patterns_learned_from_local(memory):
  500. total_patterns = 0
  501. for m in memory:
  502. total_patterns = total_patterns + 1
  503. print(" * [SETTINGS] Using [MAX. LENGTH] for range [PATTERN] = [ "+str(max_length)+" ]\n")
  504. if total_patterns > 0:
  505. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]\n")
  506. else:
  507. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]")
  508. generate_pattern_len_report_structure(memory)
  509. return memory
  510. def list_genomes_on_database():
  511. print("[LIST] [REPORTING] [DNA SECUENCES] ... -> [STARTING!]\n")
  512. print("-"*15 + "\n")
  513. f=open(genomes_list_path, 'w')
  514. for k, v in genomes.items():
  515. print ("*"+str(k)+ "-> [ "+str(len(v))+" bp linear RNA ]")
  516. print (" + [A] Adenine :", str(v.count("A")))
  517. print (" + [G] Guanine :", str(v.count("G")))
  518. print (" + [C] Cytosine :", str(v.count("C")))
  519. print (" + [T] Thymine :", str(v.count("T")))
  520. f.write(str("*"+ str(k)+ " -> [ "+str(len(v))+"bp linear RNA ]\n"))
  521. f.write(str(" + [A] Adenine : " + str(v.count("A"))+"\n"))
  522. f.write(str(" + [G] Guanine : " + str(v.count("G"))+"\n"))
  523. f.write(str(" + [C] Cytosine : " + str(v.count("C"))+"\n"))
  524. f.write(str(" + [T] Thymine : " + str(v.count("T"))+"\n"))
  525. if v.count("N") > 0:
  526. print (" + [N] *ANY* :", str(v.count("N")))
  527. f.write(str(" + [N] *ANY* : "+ str(v.count("N"))+"\n"))
  528. print ("")
  529. f.write("\n")
  530. print("-"*15 + "\n")
  531. print ("[LIST] [INFO] [SAVED!] at: '"+str(genomes_list_path)+"'... -> [EXITING!]\n")
  532. f.close()
  533. def examine_stored_brain_memory():
  534. memory = [] # list used as hot-memory
  535. f=open(brain_path, 'r')
  536. for line in f.readlines():
  537. if line not in memory:
  538. memory.append(line)
  539. f.close()
  540. if memory == "": # first time run!
  541. print ("[LIBRE-AI] [INFO] Not any [BRAIN] present ... -> [BUILDING ONE!]\n")
  542. print("-"*15 + "\n")
  543. for i in range(2, 11+1):
  544. seed = [random.randrange(0, 4) for _ in range(i)] # generate "static" genesis seed
  545. if seed not in seeds_checked:
  546. seeds_checked.append(seed)
  547. pattern = ""
  548. for n in seed:
  549. if n == 0:
  550. pattern += "A"
  551. elif n == 1:
  552. pattern += "C"
  553. elif n == 2:
  554. pattern += "T"
  555. else:
  556. pattern += "G"
  557. print("[LIBRE-AI] [SEARCH] Generating [RANDOM] pattern: " + str(pattern) + "\n")
  558. create_new_pattern(pattern) # create new pattern
  559. print("-"*15 + "\n")
  560. print ("[LIBRE-AI] [INFO] A new [BRAIN] has been created !!! ... -> [ADVANCING!]\n")
  561. f=open(brain_path, 'r')
  562. memory = f.read().replace('\n',' ')
  563. f.close()
  564. return memory
  565. def generate_pattern_len_report_structure(memory):
  566. pattern_len_1 = 0 # related with [MAX. LENGTH] range
  567. pattern_len_2 = 0
  568. pattern_len_3 = 0
  569. pattern_len_4 = 0
  570. pattern_len_5 = 0
  571. pattern_len_6 = 0
  572. pattern_len_7 = 0
  573. pattern_len_8 = 0
  574. pattern_len_9 = 0
  575. pattern_len_10 = 0
  576. pattern_len_11 = 0
  577. pattern_len_12 = 0
  578. pattern_len_13 = 0
  579. pattern_len_14 = 0
  580. pattern_len_15 = 0
  581. pattern_len_16 = 0
  582. pattern_len_17 = 0
  583. pattern_len_18 = 0
  584. pattern_len_19 = 0
  585. pattern_len_20 = 0
  586. pattern_len_21 = 0
  587. pattern_len_22 = 0
  588. pattern_len_23 = 0
  589. pattern_len_24 = 0
  590. pattern_len_25 = 0
  591. pattern_len_26 = 0
  592. pattern_len_27 = 0
  593. pattern_len_28 = 0
  594. pattern_len_29 = 0
  595. pattern_len_30 = 0
  596. pattern_len_31 = 0
  597. pattern_len_32 = 0
  598. pattern_len_33 = 0
  599. pattern_len_34 = 0
  600. pattern_len_35 = 0
  601. pattern_len_36 = 0
  602. pattern_len_37 = 0
  603. pattern_len_38 = 0
  604. pattern_len_39 = 0
  605. pattern_len_40 = 0
  606. pattern_len_41 = 0
  607. pattern_len_42 = 0
  608. pattern_len_43 = 0
  609. pattern_len_44 = 0
  610. pattern_len_45 = 0
  611. pattern_len_46 = 0
  612. pattern_len_47 = 0
  613. pattern_len_48 = 0
  614. pattern_len_49 = 0
  615. pattern_len_50 = 0
  616. for m in memory:
  617. try:
  618. pattern_len = m.split(", '")[1]
  619. pattern_len = pattern_len.split("')")[0]
  620. pattern_len = len(pattern_len)
  621. except:
  622. pattern_len = 0 # discard!
  623. if pattern_len == 1:
  624. pattern_len_1 = pattern_len_1 + 1
  625. elif pattern_len == 2:
  626. pattern_len_2 = pattern_len_2 + 1
  627. elif pattern_len == 3:
  628. pattern_len_3 = pattern_len_3 + 1
  629. elif pattern_len == 4:
  630. pattern_len_4 = pattern_len_4 + 1
  631. elif pattern_len == 5:
  632. pattern_len_5 = pattern_len_5 + 1
  633. elif pattern_len == 6:
  634. pattern_len_6 = pattern_len_6 + 1
  635. elif pattern_len == 7:
  636. pattern_len_7 = pattern_len_7 + 1
  637. elif pattern_len == 8:
  638. pattern_len_8 = pattern_len_8 + 1
  639. elif pattern_len == 9:
  640. pattern_len_9 = pattern_len_9 + 1
  641. elif pattern_len == 10:
  642. pattern_len_10 = pattern_len_10 + 1
  643. elif pattern_len == 11:
  644. pattern_len_11 = pattern_len_11 + 1
  645. elif pattern_len == 12:
  646. pattern_len_12 = pattern_len_12 + 1
  647. elif pattern_len == 13:
  648. pattern_len_13 = pattern_len_13 + 1
  649. elif pattern_len == 14:
  650. pattern_len_14 = pattern_len_14 + 1
  651. elif pattern_len == 15:
  652. pattern_len_15 = pattern_len_15 + 1
  653. elif pattern_len == 16:
  654. pattern_len_16 = pattern_len_16 + 1
  655. elif pattern_len == 17:
  656. pattern_len_17 = pattern_len_17 + 1
  657. elif pattern_len == 18:
  658. pattern_len_18 = pattern_len_18 + 1
  659. elif pattern_len == 19:
  660. pattern_len_19 = pattern_len_19 + 1
  661. elif pattern_len == 20:
  662. pattern_len_20 = pattern_len_20 + 1
  663. elif pattern_len == 21:
  664. pattern_len_21 = pattern_len_21 + 1
  665. elif pattern_len == 22:
  666. pattern_len_22 = pattern_len_22 + 1
  667. elif pattern_len == 23:
  668. pattern_len_23 = pattern_len_23 + 1
  669. elif pattern_len == 24:
  670. pattern_len_24 = pattern_len_24 + 1
  671. elif pattern_len == 25:
  672. pattern_len_25 = pattern_len_25 + 1
  673. elif pattern_len == 26:
  674. pattern_len_26 = pattern_len_26 + 1
  675. elif pattern_len == 27:
  676. pattern_len_27 = pattern_len_27 + 1
  677. elif pattern_len == 28:
  678. pattern_len_28 = pattern_len_28 + 1
  679. elif pattern_len == 29:
  680. pattern_len_29 = pattern_len_29 + 1
  681. elif pattern_len == 30:
  682. pattern_len_30 = pattern_len_30 + 1
  683. elif pattern_len == 31:
  684. pattern_len_31 = pattern_len_31 + 1
  685. elif pattern_len == 32:
  686. pattern_len_32 = pattern_len_32 + 1
  687. elif pattern_len == 33:
  688. pattern_len_33 = pattern_len_33 + 1
  689. elif pattern_len == 34:
  690. pattern_len_34 = pattern_len_34 + 1
  691. elif pattern_len == 35:
  692. pattern_len_35 = pattern_len_35 + 1
  693. elif pattern_len == 36:
  694. pattern_len_36 = pattern_len_36 + 1
  695. elif pattern_len == 37:
  696. pattern_len_37 = pattern_len_37 + 1
  697. elif pattern_len == 38:
  698. pattern_len_38 = pattern_len_38 + 1
  699. elif pattern_len == 39:
  700. pattern_len_39 = pattern_len_39 + 1
  701. elif pattern_len == 40:
  702. pattern_len_40 = pattern_len_40 + 1
  703. elif pattern_len == 41:
  704. pattern_len_41 = pattern_len_41 + 1
  705. elif pattern_len == 42:
  706. pattern_len_42 = pattern_len_42 + 1
  707. elif pattern_len == 43:
  708. pattern_len_43 = pattern_len_43 + 1
  709. elif pattern_len == 44:
  710. pattern_len_44 = pattern_len_44 + 1
  711. elif pattern_len == 45:
  712. pattern_len_45 = pattern_len_45 + 1
  713. elif pattern_len == 46:
  714. pattern_len_46 = pattern_len_46 + 1
  715. elif pattern_len == 47:
  716. pattern_len_47 = pattern_len_47 + 1
  717. elif pattern_len == 48:
  718. pattern_len_48 = pattern_len_48 + 1
  719. elif pattern_len == 49:
  720. pattern_len_49 = pattern_len_49 + 1
  721. elif pattern_len == 50:
  722. pattern_len_50 = pattern_len_50 + 1
  723. else:
  724. pass
  725. if pattern_len_1 > 0:
  726. print(" - [length = 1] : [ "+str(pattern_len_1)+" ]")
  727. if pattern_len_2 > 0:
  728. print(" - [length = 2] : [ "+str(pattern_len_2)+" ]")
  729. if pattern_len_3 > 0:
  730. print(" - [length = 3] : [ "+str(pattern_len_3)+" ]")
  731. if pattern_len_4 > 0:
  732. print(" - [length = 4] : [ "+str(pattern_len_4)+" ]")
  733. if pattern_len_5 > 0:
  734. print(" - [length = 5] : [ "+str(pattern_len_5)+" ]")
  735. if pattern_len_6 > 0:
  736. print(" - [length = 6] : [ "+str(pattern_len_6)+" ]")
  737. if pattern_len_7 > 0:
  738. print(" - [length = 7] : [ "+str(pattern_len_7)+" ]")
  739. if pattern_len_8 > 0:
  740. print(" - [length = 8] : [ "+str(pattern_len_8)+" ]")
  741. if pattern_len_9 > 0:
  742. print(" - [length = 9] : [ "+str(pattern_len_9)+" ]")
  743. if pattern_len_10 > 0:
  744. print(" - [length = 10]: [ "+str(pattern_len_10)+" ]")
  745. if pattern_len_11 > 0:
  746. print(" - [length = 11]: [ "+str(pattern_len_11)+" ]")
  747. if pattern_len_12 > 0:
  748. print(" - [length = 12]: [ "+str(pattern_len_12)+" ]")
  749. if pattern_len_13 > 0:
  750. print(" - [length = 13]: [ "+str(pattern_len_13)+" ]")
  751. if pattern_len_14 > 0:
  752. print(" - [length = 14]: [ "+str(pattern_len_14)+" ]")
  753. if pattern_len_15 > 0:
  754. print(" - [length = 15]: [ "+str(pattern_len_15)+" ]")
  755. if pattern_len_16 > 0:
  756. print(" - [length = 16]: [ "+str(pattern_len_16)+" ]")
  757. if pattern_len_17 > 0:
  758. print(" - [length = 17]: [ "+str(pattern_len_17)+" ]")
  759. if pattern_len_18 > 0:
  760. print(" - [length = 18]: [ "+str(pattern_len_18)+" ]")
  761. if pattern_len_19 > 0:
  762. print(" - [length = 19]: [ "+str(pattern_len_19)+" ]")
  763. if pattern_len_20 > 0:
  764. print(" - [length = 20]: [ "+str(pattern_len_20)+" ]")
  765. if pattern_len_21 > 0:
  766. print(" - [length = 21]: [ "+str(pattern_len_21)+" ]")
  767. if pattern_len_22 > 0:
  768. print(" - [length = 22]: [ "+str(pattern_len_22)+" ]")
  769. if pattern_len_23 > 0:
  770. print(" - [length = 23]: [ "+str(pattern_len_23)+" ]")
  771. if pattern_len_24 > 0:
  772. print(" - [length = 24]: [ "+str(pattern_len_24)+" ]")
  773. if pattern_len_25 > 0:
  774. print(" - [length = 25]: [ "+str(pattern_len_25)+" ]")
  775. if pattern_len_26 > 0:
  776. print(" - [length = 26]: [ "+str(pattern_len_26)+" ]")
  777. if pattern_len_27 > 0:
  778. print(" - [length = 27]: [ "+str(pattern_len_27)+" ]")
  779. if pattern_len_28 > 0:
  780. print(" - [length = 28]: [ "+str(pattern_len_28)+" ]")
  781. if pattern_len_29 > 0:
  782. print(" - [length = 29]: [ "+str(pattern_len_29)+" ]")
  783. if pattern_len_30 > 0:
  784. print(" - [length = 30]: [ "+str(pattern_len_30)+" ]")
  785. if pattern_len_31 > 0:
  786. print(" - [length = 31]: [ "+str(pattern_len_31)+" ]")
  787. if pattern_len_32 > 0:
  788. print(" - [length = 32]: [ "+str(pattern_len_32)+" ]")
  789. if pattern_len_33 > 0:
  790. print(" - [length = 33]: [ "+str(pattern_len_33)+" ]")
  791. if pattern_len_34 > 0:
  792. print(" - [length = 34]: [ "+str(pattern_len_34)+" ]")
  793. if pattern_len_35 > 0:
  794. print(" - [length = 35]: [ "+str(pattern_len_35)+" ]")
  795. if pattern_len_36 > 0:
  796. print(" - [length = 36]: [ "+str(pattern_len_36)+" ]")
  797. if pattern_len_37 > 0:
  798. print(" - [length = 37]: [ "+str(pattern_len_37)+" ]")
  799. if pattern_len_38 > 0:
  800. print(" - [length = 38]: [ "+str(pattern_len_38)+" ]")
  801. if pattern_len_39 > 0:
  802. print(" - [length = 39]: [ "+str(pattern_len_39)+" ]")
  803. if pattern_len_40 > 0:
  804. print(" - [length = 40]: [ "+str(pattern_len_40)+" ]")
  805. if pattern_len_41 > 0:
  806. print(" - [length = 41]: [ "+str(pattern_len_41)+" ]")
  807. if pattern_len_42 > 0:
  808. print(" - [length = 42]: [ "+str(pattern_len_42)+" ]")
  809. if pattern_len_43 > 0:
  810. print(" - [length = 43]: [ "+str(pattern_len_43)+" ]")
  811. if pattern_len_44 > 0:
  812. print(" - [length = 44]: [ "+str(pattern_len_44)+" ]")
  813. if pattern_len_45 > 0:
  814. print(" - [length = 45]: [ "+str(pattern_len_45)+" ]")
  815. if pattern_len_46 > 0:
  816. print(" - [length = 46]: [ "+str(pattern_len_46)+" ]")
  817. if pattern_len_47 > 0:
  818. print(" - [length = 47]: [ "+str(pattern_len_47)+" ]")
  819. if pattern_len_48 > 0:
  820. print(" - [length = 48]: [ "+str(pattern_len_48)+" ]")
  821. if pattern_len_49 > 0:
  822. print(" - [length = 49]: [ "+str(pattern_len_49)+" ]")
  823. if pattern_len_50 > 0:
  824. print(" - [length = 50]: [ "+str(pattern_len_50)+" ]")
  825. def print_banner():
  826. print("\n"+"="*50)
  827. print(" ____ _ _ _ _ ")
  828. print("| _ \(_) __ _| \ | | / \ ")
  829. print("| | | | |/ _` | \| | / _ \ ")
  830. print("| |_| | | (_| | |\ |/ ___ \ ")
  831. print("|____/|_|\__,_|_| \_/_/ \_\ by psy")
  832. print('\n"Search and Recognize patterns in DNA sequences"')
  833. print("\n"+"="*50)
  834. print("+ GENOMES DETECTED:", str(num_files))
  835. print("="*50)
  836. print("\n"+"-"*15+"\n")
  837. print(" * VERSION: ")
  838. print(" + "+VERSION+" - (rev:"+RELEASE+")")
  839. print("\n * SOURCES:")
  840. print(" + "+SOURCE1)
  841. print(" + "+SOURCE2)
  842. print("\n * CONTACT: ")
  843. print(" + "+CONTACT+"\n")
  844. print("-"*15+"\n")
  845. print("="*50)
  846. # sub_init #
  847. num_files=0
  848. for file in glob.iglob(genomes_path + '**/*', recursive=True):
  849. if(file.endswith(".genome")):
  850. num_files = num_files + 1
  851. f=open(file, 'r')
  852. genome = f.read().replace('\n',' ')
  853. genomes[file.replace("datasets/","")] = genome.upper() # add genome to main dict
  854. f.close()
  855. print_banner() # show banner
  856. option = input("\n+ CHOOSE: (S)earch, (L)ist, (T)rain or (R)eport: ").upper()
  857. print("")
  858. print("="*50+"\n")
  859. if option == "S": # search pattern
  860. search_pattern_with_human()
  861. elif option == "L": # list genomes
  862. list_genomes_on_database()
  863. elif option == "T": # teach AI
  864. teach_ai()
  865. else: # libre AI
  866. libre_ai()
  867. print ("="*50+"\n")