diana.py 50 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135
  1. #!/usr/bin/env python
# -*- coding: utf-8 -*-
  3. """
  4. DiaNA - 2020 - by psy (epsylon@riseup.net)
  5. You should have received a copy of the GNU General Public License along
  6. with DiaNA; if not, write to the Free Software Foundation, Inc., 51
  7. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  8. """
# Release metadata for this script.
VERSION = "v0.3_beta"  # version tag
RELEASE = "19032020"  # release stamp (looks like DDMMYYYY -- TODO confirm)
SOURCE1 = "https://code.03c8.net/epsylon/diana"  # primary source repository
SOURCE2 = "https://github.com/epsylon/diana"  # mirror repository
CONTACT = "epsylon@riseup.net - (https://03c8.net)"  # author contact
  14. """
  15. DNA-equiv:
  16. A <-> T
  17. C <-> G
  18. """
  19. import re, os, glob, random, time, math
# --- Resource locations (relative paths) ---
brain_path = "resources/BRAIN/brain.in" # in/out brain-tmp file
genomes_path = 'datasets/' # genome datasets raw data
genomes_list_path = "datasets/genome.list" # genome list
universal_primer_list_path = "resources/PATTERNS/UPL.list" # UPL list
dna_codons_list_path = "resources/PATTERNS/DNAcodon.list" # DNA codon list
open_reading_frames_init_path = "resources/PATTERNS/ORF/ORF-init.list" # ORF init list
open_reading_frames_end_path = "resources/PATTERNS/ORF/ORF-end.list" # ORF end list

# --- Module-level state shared by the functions below ---
genomes = {} # main sources dict: genome_name -> sequence string
seeds_checked = [] # list used for random checked patterns (seeds already generated)
# repetitions 'tmp' dict; written in two shapes:
#   genome_name -> (times, pattern)                 (search by genome)
#   (index, p_index) -> (pattern, genome, times)    (search by pattern)
repeats = {}
known_patterns = [] # list used for known patterns
max_length = 50 # [MAX. LENGTH] for range [PATTERN]
  32. def convert_size(size):
  33. if (size == 0):
  34. return '0 B'
  35. size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
  36. i = int(math.floor(math.log(size,1024)))
  37. p = math.pow(1024,i)
  38. s = round(size/p,2)
  39. return s, size_name[i]
  40. def search_pattern_with_human():
  41. pattern = input("[HUMAN] [SEARCH] Pattern (ex: attacg): ").upper()
  42. print("\n"+"-"*5 + "\n")
  43. create_new_pattern(pattern) # create new pattern
  44. def try_pattern_against_all_genomes_by_genome(pattern):
  45. for k, v in genomes.items():
  46. if pattern in v:
  47. t = len(re.findall(pattern, v))
  48. repeats[k] = t, pattern # create dict: genome = times, pattern
  49. def try_pattern_against_all_genomes_by_pattern(pattern, index):
  50. p_index = 0 # pattern index
  51. for k, v in genomes.items():
  52. if pattern in v:
  53. p_index = p_index + 1
  54. t = len(re.findall(pattern, v))
  55. repeats[index,p_index] = pattern, k, t # create dict: index, p_index = pattern, genome, times
  56. def sanitize_dna_pattern(pattern):
  57. valid_pattern = True
  58. for c in pattern:
  59. if c == "A":
  60. pass
  61. elif c == "T":
  62. pass
  63. elif c == "G":
  64. pass
  65. elif c == "C":
  66. pass
  67. elif c == "N":
  68. pass
  69. else:
  70. valid_pattern = False
  71. return valid_pattern
  72. def teach_ai():
  73. mode = input("[TRAIN-AI] MODE -> (H)uman, (A)utomata: ").upper()
  74. if not os.path.isfile(brain_path):
  75. create_initial_seed_file()
  76. if mode == "H": # human mode
  77. teach_ai_human_mode()
  78. else: # libre AI
  79. teach_ai_automata_mode() # automata mode
def teach_ai_human_mode(): # search/discard patterns with human interaction & generate local database
    """Entry point of the human-driven training mode.

    Simply delegates to search_patterns_lesson_with_a_human().
    """
    search_patterns_lesson_with_a_human()
  82. def search_patterns_lesson_with_a_human():
  83. print("\n"+"-"*30)
  84. print("\n[TRAIN-AI] [HUMAN] [STOP] this mode; just entering whatever invalid pattern (ex: 'exit' or 'q').\n")
  85. key = "K" # continue
  86. while key == "K":
  87. pattern = input("[TRAIN-AI] [HUMAN] [LOOP] [SEARCH] Pattern (ex: attacg): ").upper()
  88. print("\n"+"-"*5 + "\n")
  89. key = search_pattern_on_lesson(pattern)
  90. if key == "Z": # stop
  91. break
  92. def search_pattern_on_lesson(pattern):
  93. valid_pattern = sanitize_dna_pattern(pattern)
  94. if valid_pattern == True:
  95. key = search_pattern_on_local_database(pattern) # search pattern on local database
  96. else:
  97. print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
  98. key = "Z" # stop
  99. return key
  100. def search_pattern_on_local_database(pattern):
  101. f=open(brain_path, 'r')
  102. memory = f.read().replace('\n',' ')
  103. f.close()
  104. patterns_known = 0
  105. if not "'"+pattern+"'" in memory: # always create new patterns
  106. create_new_pattern(pattern) # create new pattern
  107. patterns_known = patterns_known + 1
  108. else:
  109. for k, v in genomes.items(): # create patterns found for new genomes
  110. if k not in memory:
  111. create_new_pattern(pattern) # create new pattern
  112. patterns_known = patterns_known + 1
  113. if patterns_known == 0:
  114. print("[TRAIN-AI] [AUTOMATA] [LOOP] [RESULTS] -ALREADY- [LEARNED!] ... -> [GOING FOR NEXT!]\n")
  115. print("-"*5 + "\n")
  116. key = "K" # continue
  117. return key
  118. def create_initial_seed_file():
  119. f=open(brain_path, 'w')
  120. f.write("")
  121. f.close()
  122. def create_new_pattern(pattern): # append it to brain
  123. valid_pattern = sanitize_dna_pattern(pattern)
  124. if valid_pattern == True:
  125. if pattern not in known_patterns:
  126. known_patterns.append(pattern)
  127. try_pattern_against_all_genomes_by_genome(pattern) # generate repeats dict
  128. patterns_found = 0
  129. for k, v in repeats.items(): # list patterns found to output
  130. print (" *", k +":", "-> ",v,"")
  131. patterns_found = patterns_found + 1
  132. print("")
  133. if patterns_found == 0:
  134. print("[INFO] -> Not any found! ... [EXITING!]\n")
  135. else:
  136. f=open(brain_path, 'a')
  137. f.write(str(repeats)+os.linesep) # add dict as str
  138. f.close()
  139. else:
  140. print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
def teach_ai_automata_mode(): # search patterns by bruteforcing ranges & generate local database
    """Entry point of the automata training mode.

    Simply delegates to search_patterns_lesson_with_an_ai().
    """
    search_patterns_lesson_with_an_ai()
  143. def search_patterns_lesson_with_an_ai():
  144. print("\n"+"-"*30)
  145. print("\n[TRAIN-AI] [AUTOMATA] [STOP] this mode; pressing 'CTRL+z'.\n")
  146. ranges = input("[TRAIN-AI] [AUTOMATA] [SEARCH] Set range (x<y) for pattern deep searching (ex: 2-8): ")
  147. print ("")
  148. valid_range, ranged_permutations = check_for_deep_searching_ranges(ranges)
  149. if str(valid_range) == "OK!":
  150. ranged_ending = False
  151. print("-"*15)
  152. print("\n[TRAIN-AI] [AUTOMATA] [SEARCH] Number of [PERMUTATIONS] estimated: [ "+str(ranged_permutations)+" ]\n")
  153. print("-"*15+"\n")
  154. num_pat = 0
  155. while ranged_ending == False: # try to STOP it using: CTRL-z
  156. try:
  157. pattern, ranged_ending = generate_random_pattern(ranges, ranged_permutations) # generate random seed
  158. if pattern:
  159. num_pat = num_pat + 1
  160. print("[TRAIN-AI] [AUTOMATA] [LOOP] [SEARCH] Generating [RANDOM!] ["+str(num_pat)+"/"+str(ranged_permutations)+"] pattern: [ " + str(pattern) + " ]\n")
  161. if not num_pat == ranged_permutations:
  162. search_pattern_on_lesson(pattern)
  163. else:
  164. search_pattern_on_lesson(pattern)
  165. print("[TRAIN-AI] [AUTOMATA] [RESULTS]: REVIEWED -> [ "+str(ranged_permutations)+" PERMUTATIONS ] ... -> [EXITING!]\n")
  166. ranged_ending = True
  167. except:
  168. pass
  169. else:
  170. print("-"*15+"\n")
  171. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] -> "+valid_range+" ... [EXITING!]\n")
  172. def generate_random_pattern(ranges, ranged_permutations):
  173. ranged_length = 0
  174. try:
  175. range_low = int(ranges.split("-")[0])
  176. range_high = int(ranges.split("-")[1])
  177. for i in range(range_low, range_high+1):
  178. ranged_length = ranged_length + 1
  179. if ranged_length == ranged_permutations: # all possible variables have been bruteforced/checked! -> exit
  180. pattern = None
  181. ranged_ending = True
  182. return pattern, ranged_ending
  183. else:
  184. ranged_ending = False
  185. seed = [random.randrange(0, 4) for _ in range(i)] # generate "random" seed
  186. if seed not in seeds_checked:
  187. seeds_checked.append(seed)
  188. pattern = ""
  189. for n in seed:
  190. if n == 0:
  191. pattern += "A"
  192. elif n == 1:
  193. pattern += "C"
  194. elif n == 2:
  195. pattern += "T"
  196. else:
  197. pattern += "G"
  198. return pattern, ranged_ending
  199. except:
  200. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] ... [EXITING!]\n")
  201. pattern = None
  202. ranged_ending = True
  203. return pattern, ranged_ending
  204. def check_for_deep_searching_ranges(ranges):
  205. try:
  206. range_low = ranges.split("-")[0]
  207. range_high = ranges.split("-")[1]
  208. except:
  209. valid_range = "'bad format'"
  210. try:
  211. range_low = int(range_low)
  212. except:
  213. valid_range = "'low range' should be an integer"
  214. try:
  215. range_high = int(range_high)
  216. except:
  217. valid_range = "'high range' should be an integer"
  218. try:
  219. if range_low < range_high:
  220. if range_low > 1: # always range > 1
  221. valid_range = "OK!"
  222. else:
  223. valid_range = "'low range' should be > than 1"
  224. else:
  225. valid_range = "'low range' should be < than 'high range'"
  226. except:
  227. valid_range = "'bad format'"
  228. try:
  229. ranged_permutations = math_ranged_permutations(range_low, range_high)
  230. except:
  231. ranged_permutations = 0
  232. valid_range = "'bad format'"
  233. return valid_range, ranged_permutations
  234. def math_ranged_permutations(range_low, range_high): # calculate ranged_permutations
  235. ranged_permutations = 0
  236. for i in range(range_low, range_high+1):
  237. ranged_permutations = ranged_permutations + (4**i)
  238. return ranged_permutations
  239. def libre_ai(): # show statistics / download new genomes / keep crossing new genomes with local database / search for new patterns (non stop!)
  240. if not os.path.isfile(brain_path):
  241. create_initial_seed_file()
  242. memory = examine_stored_brain_memory()
  243. if memory != "":
  244. #print("[LIBRE-AI] [STOP] this mode; pressing 'CTRL+z'.\n")
  245. libre_ai_show_statistics(memory) # show statistics
  246. def libre_ai_show_statistics(memory):
  247. print("[LIBRE-AI] [REPORTING] [STATISTICS] ... -> [STARTING!]\n")
  248. print("-"*15 + "\n")
  249. total_genomes = 0
  250. total_adenine = 0
  251. total_guanine = 0
  252. total_cytosine = 0
  253. total_thymine = 0
  254. total_any = 0
  255. total_patterns = 0
  256. secuence_length = 0
  257. secuences_length_list = {}
  258. largest = None
  259. largest_len = 0
  260. shortest_len = 0
  261. average = None
  262. shortest = None
  263. for k, v in genomes.items():
  264. secuence_length = len(v)
  265. secuences_length_list[k] = str(secuence_length)
  266. total_genomes = total_genomes + 1
  267. total_adenine = total_adenine + v.count("A")
  268. total_guanine = total_guanine + v.count("G")
  269. total_cytosine = total_cytosine + v.count("C")
  270. total_thymine = total_thymine + v.count("T")
  271. total_any = total_any + v.count("N")
  272. path = genomes_path # genome datasets raw data
  273. l = glob.glob(genomes_path+"*") # black magic!
  274. latest_collection_file = max(l, key=os.path.getctime)
  275. latest_collection_date = time.ctime(os.path.getmtime(latest_collection_file))
  276. total_nucleotids = [total_adenine, total_guanine, total_cytosine, total_thymine, total_any]
  277. num_total_nucleotids = total_adenine + total_guanine + total_cytosine + total_thymine + total_any
  278. nucleotid_more_present = max(total_nucleotids)
  279. print("[LIBRE-AI] [REPORTING] -STORAGE- [STATISTICS]: \n")
  280. extract_storage_sizes()
  281. print(" * [LATEST UPDATE]: '"+str(latest_collection_date)+"'\n")
  282. print(" + File: '"+str(latest_collection_file)+"'\n")
  283. print("-"*5 + "\n")
  284. print("[LIBRE-AI] [REPORTING] -COLLECTION- [STATISTICS]: \n")
  285. extract_total_patterns_learned_from_local(memory)
  286. print("\n"+"-"*5 + "\n")
  287. print("[LIBRE-AI] [REPORTING] -ANALYSIS- [STATISTICS]: \n")
  288. print(" * Total [DNA SECUENCES]: [ "+str(total_genomes)+" ]\n")
  289. largest = 0
  290. largest_pattern_name = []
  291. largest_pattern_size = []
  292. for k, v in secuences_length_list.items():
  293. if int(v) > int(largest):
  294. largest = v
  295. largest_pattern_name.append(k)
  296. largest_pattern_size.append(largest)
  297. for p in largest_pattern_name:
  298. largest_pattern_name = p
  299. for s in largest_pattern_size:
  300. largest_pattern_size = s
  301. print(" + [LARGEST] : "+str(largest_pattern_name)+ " [ "+str(largest_pattern_size)+" bp linear RNA ]")
  302. prev_shortest = None
  303. shortest_pattern_name = []
  304. shortest_pattern_size = []
  305. for k, v in secuences_length_list.items():
  306. if prev_shortest == None:
  307. shortest = v
  308. shortest_pattern_name.append(k)
  309. shortest_pattern_size.append(shortest)
  310. prev_shortest = True
  311. else:
  312. if int(v) < int(shortest):
  313. shortest = v
  314. shortest_pattern_name.append(k)
  315. shortest_pattern_size.append(shortest)
  316. for p in shortest_pattern_name:
  317. shortest_pattern_name = p
  318. for s in shortest_pattern_size:
  319. shortest_pattern_size = s
  320. print(" + [SHORTEST]: "+str(shortest_pattern_name)+ " [ "+str(shortest_pattern_size)+" bp linear RNA ]\n")
  321. print(" * Total [NUCLEOTIDS]: [ "+str(num_total_nucleotids)+" ]\n")
  322. if nucleotid_more_present == total_adenine:
  323. print(" + [A] Adenine : "+str(total_adenine)+" <- [MAX]")
  324. else:
  325. print(" + [A] Adenine : "+str(total_adenine))
  326. if nucleotid_more_present == total_guanine:
  327. print(" + [G] Guanine : "+str(total_guanine)+" <- [MAX]")
  328. else:
  329. print(" + [G] Guanine : "+str(total_guanine))
  330. if nucleotid_more_present == total_cytosine:
  331. print(" + [C] Cytosine : "+str(total_cytosine)+" <- [MAX]")
  332. else:
  333. print(" + [C] Cytosine : "+str(total_cytosine))
  334. if nucleotid_more_present == total_thymine:
  335. print(" + [T] Thymine : "+str(total_thymine)+" <- [MAX]")
  336. else:
  337. print(" + [T] Thymine : "+str(total_thymine))
  338. if total_any > 0:
  339. if nucleotid_more_present == total_any:
  340. print(" + [N] *ANY* : "+str(total_any)+" <- [MAX]\n")
  341. else:
  342. print(" + [N] *ANY* : "+str(total_any)+"\n")
  343. print("-"*5 + "\n")
  344. extract_pattern_most_present_local(memory)
  345. def convert_memory_to_dict(memory): # [index] = genome_name, pattern, num_rep
  346. memory_dict = {}
  347. index = 0
  348. for m in memory:
  349. regex_record = "'(.+?)': (.+?), '(.+?)'" # regex magics! - extract first each record
  350. pattern_record = re.compile(regex_record)
  351. record = re.findall(pattern_record, m)
  352. for r in record: # now extract each field
  353. index = index + 1
  354. name = str(r).split("', '(")[0]
  355. genome_name = str(name).split("'")[1]
  356. repeats = str(r).split("', '(")[1]
  357. genome_repeats = str(repeats).split("',")[0]
  358. pattern = str(repeats).split("',")[1]
  359. genome_pattern = pattern.replace(" ", "")
  360. genome_pattern = genome_pattern.replace("'", "")
  361. genome_pattern = genome_pattern.replace(")", "")
  362. memory_dict[index] = genome_name, genome_pattern, genome_repeats # generate memory_dict!
  363. return memory_dict
  364. def extract_pattern_most_present_local(memory):
  365. memory_dict = convert_memory_to_dict(memory)
  366. if genomes:
  367. try:
  368. f=open(dna_codons_list_path, 'r')
  369. codons = f.readlines()
  370. f.close()
  371. except:
  372. pass
  373. print("[LIBRE-AI] [REPORTING] -RESEARCHING- [STATISTICS]: \n")
  374. total_genomes = 0
  375. for k, v in genomes.items():
  376. total_genomes = total_genomes + 1
  377. if memory_dict:
  378. total_patterns = 0
  379. for m in memory:
  380. total_patterns = total_patterns + 1 # counter used for known patterns
  381. max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list = extract_patterns_most_found_in_all_genomes(memory_dict)
  382. print(" * Searching -[ "+str(total_patterns)+" ]- [PATTERNS LEARNED!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
  383. if total_patterns_all_genomes:
  384. print("\n + Total [PATTERNS FOUND!]: [ "+str(total_patterns_all_genomes)+" ]")
  385. biggest_pattern_name_codon = None
  386. for c in codons:
  387. if c.split(":")[0] == str(biggest_pattern_name):
  388. biggest_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
  389. print("\n - [MOST-PRESENT!]: [ "+str(biggest_pattern_size)+" ] time(s) -> [ "+str(biggest_pattern_name)+" ] "+str(biggest_pattern_name_codon)+"\n")
  390. if biggest_pattern_name_codon == None:
  391. print("\n - [MOST-PRESENT!]: [ "+str(biggest_pattern_size)+" ] time(s) -> [ "+str(biggest_pattern_name)+" ]\n")
  392. other_pattern_name_codon = None
  393. for k, v in most_present_patterns_by_len_list.items():
  394. for c in codons:
  395. if c.split(":")[0] == str(v[0]):
  396. other_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
  397. print(" * [length = "+str(k)+"] : [ "+str(v[1])+" ] time(s) -> [ "+str(v[0])+" ] "+str(other_pattern_name_codon))
  398. if other_pattern_name_codon == None:
  399. print(" * [length = "+str(k)+"] : [ "+str(v[1])+" ] time(s) -> [ "+str(v[0])+" ]")
  400. other_pattern_name_codon = None
  401. smaller_pattern_name_codon = None
  402. for c in codons:
  403. if c.split(":")[0] == str(smaller_pattern_name):
  404. smaller_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
  405. print("\n - [LESS-PRESENT!]: [ "+str(smaller_pattern_size)+" ] time(s) -> [ "+str(smaller_pattern_name)+" ] "+str(smaller_pattern_name_codon)+"\n")
  406. if smaller_pattern_name_codon == None:
  407. print("\n - [LESS-PRESENT!]: [ "+str(smaller_pattern_size)+" ] time(s) -> [ "+str(smaller_pattern_name)+" ]\n")
  408. other_pattern_name_codon = None
  409. for n, m in less_present_patterns_by_len_list.items():
  410. for c in codons:
  411. if c.split(":")[0] == str(m[0]):
  412. other_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
  413. print(" * [length = "+str(n)+"] : [ "+str(m[1])+" ] time(s) -> [ "+str(m[0])+" ] "+str(other_pattern_name_codon))
  414. if other_pattern_name_codon == None:
  415. print(" * [length = "+str(n)+"] : [ "+str(m[1])+" ] time(s) -> [ "+str(m[0])+" ]")
  416. other_pattern_name_codon = None
  417. max_size_pattern_name = max(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
  418. less_size_pattern_name = min(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
  419. print("\n - [LARGEST] : [ "+str(max_size_pattern_name)+" ] bp linear RNA")
  420. print(" - [SHORTEST]: [ "+str(less_size_pattern_name)+" ] bp linear RNA\n")
  421. else:
  422. print("\n + Total [PATTERNS FOUND!]: [ 0 ]\n")
  423. try:
  424. f=open(universal_primer_list_path, 'r')
  425. UPL = f.readlines()
  426. f.close()
  427. if UPL:
  428. extract_potential_primer_pairs(UPL, total_genomes, codons)
  429. except:
  430. pass
  431. if codons:
  432. extract_potential_dna_codons(codons, total_genomes)
  433. def extract_potential_primer_pairs(UPL, total_genomes, codons):
  434. total_universal_primer_pairs = 0
  435. total_primer_pairs_found = 0
  436. primer_pairs_found_list = {}
  437. for pp in UPL:
  438. total_universal_primer_pairs = total_universal_primer_pairs + 1
  439. for k, v in genomes.items():
  440. pair_name = pp.split(":")[1].upper().replace("\n","")
  441. pair_sec = pp.split(":")[0]
  442. if str(pair_name) in str(v.upper()):
  443. pair_times = v.count(pair_name)
  444. total_primer_pairs_found += pair_times
  445. primer_pairs_found_list[pair_sec] = pair_name, total_primer_pairs_found
  446. print(" * Searching -[ "+str(total_universal_primer_pairs)+" ]- [UNIVERSAL PRIMER PAIRS!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
  447. if total_primer_pairs_found:
  448. total_primer_pairs_found_list = 0
  449. for m, n in primer_pairs_found_list.items():
  450. total_primer_pairs_found_list = total_primer_pairs_found_list + n[1]
  451. print("\n + Total [UNIVERSAL PRIMER PAIRS FOUND!]: [ "+str(total_primer_pairs_found_list)+" ]\n")
  452. for m, n in primer_pairs_found_list.items():
  453. print(" * "+str(m)+" -> [ "+str(n[0])+" ] : [ "+str(n[1])+" ] time(s)")
  454. print ("")
  455. else:
  456. print("\n + Total [UNIVERSAL PRIMER PAIRS FOUND!]: [ 0 ]\n")
  457. def extract_potential_dna_codons(codons, total_genomes):
  458. total_codons = 0
  459. total_codons_found = 0
  460. codons_found_list = {}
  461. codons_found_list_by_codon = {}
  462. index = 0
  463. for c in codons:
  464. total_codons = total_codons + 1
  465. for k, v in genomes.items():
  466. codon_name = c.split(":")[0].upper().replace("\n","")
  467. if str(codon_name) in str(v.upper()):
  468. index = index + 1
  469. codons_times = v.count(codon_name)
  470. total_codons_found += codons_times
  471. codons_found_list[index] = codons_times, c.split(":")[0], str(c.split(":")[1]), k
  472. print(" * Searching -[ "+str(total_codons)+" ]- [PATTERN CODONS!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
  473. if total_codons_found:
  474. for m, n in codons_found_list.items():
  475. codon_sec = str(n[1])
  476. codon_name = str(n[2].replace("\n",""))
  477. if not codon_sec in codons_found_list_by_codon.keys():
  478. codons_found_list_by_codon[codon_sec] = codon_name, m
  479. else:
  480. for r, s in codons_found_list_by_codon.items():
  481. if codon_sec == r:
  482. new_v = s[1] + m
  483. codons_found_list_by_codon[codon_sec] = codon_name, new_v
  484. codons_found_list_by_name = {}
  485. for g,z in codons_found_list_by_codon.items():
  486. if not z[0] in codons_found_list_by_name.keys():
  487. codons_found_list_by_name[z[0]]= z[1]
  488. else:
  489. for e, q in codons_found_list_by_name.items():
  490. if z[0] == e:
  491. new_s = q + z[1]
  492. codons_found_list_by_name[z[0]] = new_s
  493. total_codons_by_codon = 0
  494. for p, f in codons_found_list_by_name.items():
  495. total_codons_by_codon = total_codons_by_codon + f
  496. print("\n + Total [PATTERN CODONS FOUND!]: [ "+str(total_codons_by_codon)+" ]\n")
  497. most_present_codons_found = max(codons_found_list_by_name, key=codons_found_list_by_name.get)
  498. less_present_codons_found = min(codons_found_list_by_name, key=codons_found_list_by_name.get)
  499. print(" - [MOST-PRESENT!]: "+str(most_present_codons_found))
  500. print(" - [LESS-PRESENT!]: "+str(less_present_codons_found)+"\n")
  501. for p, f in codons_found_list_by_name.items():
  502. print(" * "+str(p)+" : "+str(f)+" time(s)")
  503. print ("")
  504. else:
  505. print("\n + Total [PATTERN CODONS FOUND!]: [ 0 ]\n")
  506. if codons_found_list:
  507. extract_open_reading_frames(total_genomes)
  508. def extract_open_reading_frames(total_genomes):
  509. try:
  510. f=open(open_reading_frames_init_path, 'r')
  511. frames_init = f.readlines()
  512. f.close()
  513. except:
  514. pass
  515. try:
  516. e=open(open_reading_frames_end_path, 'r')
  517. frames_end = e.readlines()
  518. e.close()
  519. except:
  520. pass
  521. if frames_init and frames_end:
  522. print(" * Searching for [OPEN READING FRAMES!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
  523. total_opr_found = 0
  524. r_found_by_pattern = 0
  525. opr_found_list = {}
  526. index = 0
  527. for k, v in genomes.items():
  528. for opr_i in frames_init:
  529. opr_init_name = opr_i.replace("\n","")
  530. if str(opr_init_name) in str(v.upper()): # open reading INIT frame found!
  531. for opr_e in frames_end:
  532. opr_end_name = opr_e.replace("\n","")
  533. if str(opr_end_name) in str(v.upper()): # open reading END frame found!
  534. regex_opr = str(opr_init_name) +"(.+?)"+str(opr_end_name) # regex magics! - extract secuence between ocr_i and ocr_e
  535. pattern_record = re.compile(regex_opr)
  536. record = re.findall(pattern_record, str(v.upper()))
  537. for r in record: # now extract each field
  538. total_opr_found = total_opr_found + 1
  539. r_found_by_pattern = v.count(opr_init_name+r+opr_end_name)
  540. index = index + 1
  541. opr_found_list[index] = k, r_found_by_pattern, opr_init_name, r, opr_end_name # [index]: genome, num_times, opr_i, pattern, opr_e
  542. if total_opr_found > 0:
  543. print("\n + Total [OPEN READING FRAMES FOUND!]: [ "+str(total_opr_found)+" ]\n")
  544. most_present_opr_found = max(opr_found_list, key=opr_found_list.get)
  545. largest_pattern = 0
  546. largest_pattern_found = None
  547. for m, n in opr_found_list.items():
  548. opr_found_init = str(n[2])
  549. opr_found_pattern = str(n[3])
  550. opr_found_end = str(n[4])
  551. opr_found_times = str(n[1])
  552. opr_found_genome = str(n[0])
  553. opr_found_pattern_len = len(opr_found_pattern)
  554. if opr_found_pattern_len > largest_pattern:
  555. largest_pattern = opr_found_pattern_len
  556. largest_pattern_found = opr_found_init, opr_found_pattern, opr_found_end, opr_found_genome
  557. if m == most_present_opr_found:
  558. most_present_opr_found_init = str(n[2])
  559. most_present_opr_found_pattern = str(n[3])
  560. most_present_opr_found_end = str(n[4])
  561. most_present_opr_found_times = str(n[1])
  562. most_present_opr_found_genome = str(n[0])
  563. print(" - [MOST-PRESENT!]: [ "+str(most_present_opr_found_times)+" ] time(s) found in [ "+str(most_present_opr_found_genome)+" ] is -> [ "+str(most_present_opr_found_init)+"-{?}-"+str(most_present_opr_found_end)+" ]:\n")
  564. print(str(" * "+str(most_present_opr_found_init+most_present_opr_found_pattern+most_present_opr_found_end)))
  565. print("\n - [LARGEST]: [ "+str(len(largest_pattern_found[1]))+" bp linear RNA ] found in [ "+str(largest_pattern_found[3])+" ] is -> [ "+str(largest_pattern_found[0])+"-{?}-"+str(largest_pattern_found[2])+" ]:\n")
  566. print(str(" * "+str(largest_pattern_found[0]+largest_pattern_found[1]+largest_pattern_found[2])+"\n"))
  567. else:
  568. print("\n + Total [OPEN READING FRAMES FOUND!]: [ 0 ]\n")
  569. else:
  570. print("\n + Total [OPEN READING FRAMES FOUND!]: [ 0 ]\n")
  571. def extract_patterns_most_found_in_all_genomes(memory_dict):
  572. present_patterns = []
  573. for m, p in memory_dict.items():
  574. pattern = p[1]
  575. if pattern not in present_patterns:
  576. present_patterns.append(pattern)
  577. index = 0 # genome num index
  578. for pattern in present_patterns:
  579. index = index + 1
  580. try_pattern_against_all_genomes_by_pattern(pattern, index)
  581. total_patterns_all_genomes = 0
  582. largest_size_by_pattern = {}
  583. largest_size_by_pattern_index = 0
  584. for k,v in repeats.items():
  585. largest_size_by_pattern_index = largest_size_by_pattern_index + 1
  586. largest_size_by_pattern[largest_size_by_pattern_index] = v[0], v[2]
  587. total_patterns_by_pattern = 0
  588. list_total_patterns_by_pattern = {}
  589. for i, v in largest_size_by_pattern.items():
  590. total_patterns_by_pattern = total_patterns_by_pattern + v[1]
  591. list_total_patterns_by_pattern[v[0]] = total_patterns_by_pattern
  592. biggest_pattern_name = None
  593. biggest_pattern_size = 0
  594. smaller_pattern_name = None
  595. smaller_pattern_size = 0
  596. max_size_pattern = 0
  597. for r, z in list_total_patterns_by_pattern.items():
  598. total_patterns_all_genomes = total_patterns_all_genomes + z
  599. pattern_length = len(r)
  600. if pattern_length > max_size_pattern:
  601. max_size_pattern_name = r
  602. if biggest_pattern_name == None:
  603. biggest_pattern_name = r
  604. smaller_pattern_name = r
  605. biggest_pattern_size = z
  606. smaller_pattern_size = z
  607. less_size_pattern_name = r
  608. less_size_pattern_size = z
  609. else:
  610. if pattern_length < less_size_pattern_size:
  611. less_size_pattern_size = pattern_length
  612. less_size_pattern_name = r
  613. if z > biggest_pattern_size:
  614. biggest_pattern_name = r
  615. biggest_pattern_size = z
  616. else:
  617. if z < smaller_pattern_size:
  618. smaller_pattern_name = r
  619. smaller_pattern_size = z
  620. most_present_patterns_by_len_list = extract_most_present_pattern_by_len(list_total_patterns_by_pattern)
  621. less_present_patterns_by_len_list = extract_less_present_pattern_by_len(list_total_patterns_by_pattern)
  622. return max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list
  623. def extract_most_present_pattern_by_len(list_total_patterns_by_pattern):
  624. most_present_patterns_by_len_list = {}
  625. for k, v in list_total_patterns_by_pattern.items():
  626. pattern_len = len(k)
  627. if pattern_len in most_present_patterns_by_len_list.keys():
  628. if v > most_present_patterns_by_len_list[pattern_len][1]:
  629. most_present_patterns_by_len_list[pattern_len] = k, v
  630. else:
  631. most_present_patterns_by_len_list[pattern_len] = k, v
  632. return most_present_patterns_by_len_list
  633. def extract_less_present_pattern_by_len(list_total_patterns_by_pattern):
  634. less_present_patterns_by_len_list = {}
  635. for k, v in list_total_patterns_by_pattern.items():
  636. pattern_len = len(k)
  637. if pattern_len in less_present_patterns_by_len_list.keys():
  638. if v < less_present_patterns_by_len_list[pattern_len][1]:
  639. less_present_patterns_by_len_list[pattern_len] = k, v
  640. else:
  641. less_present_patterns_by_len_list[pattern_len] = k, v
  642. return less_present_patterns_by_len_list
  643. def extract_storage_sizes():
  644. total_dataset_size = 0
  645. total_files_size = 0
  646. total_list_size = 0
  647. for file in glob.iglob(genomes_path + '*/*/*', recursive=True): # extract datasets sizes
  648. if(file.endswith(".genome")):
  649. total_dataset_size = total_dataset_size + len(file)
  650. try:
  651. f=open(brain_path, "r") # extract brain sizes
  652. total_brain_size = len(f.read())
  653. f.close()
  654. except:
  655. total_brain_size = 0
  656. try:
  657. f=open(genomes_list_path, "r") # extract genomes list sizes
  658. total_list_size = len(f.read())
  659. f.close()
  660. except:
  661. total_list_size = 0
  662. if total_dataset_size > 0:
  663. total_files_size = int(total_files_size) + int(total_dataset_size)
  664. dataset_s, dataset_size_name = convert_size(total_dataset_size)
  665. total_dataset_size = '%s %s' % (dataset_s,dataset_size_name)
  666. if total_brain_size > 0:
  667. total_files_size = int(total_files_size) + int(total_brain_size)
  668. brain_s, brain_size_name = convert_size(total_brain_size)
  669. total_brain_size = '%s %s' % (brain_s,brain_size_name)
  670. if total_list_size > 0:
  671. total_files_size = int(total_files_size) + int(total_list_size)
  672. list_s, list_size_name = convert_size(total_list_size)
  673. total_list_size = '%s %s' % (list_s,list_size_name)
  674. total_s, total_size_name = convert_size(total_files_size)
  675. total_files_size = '%s %s' % (total_s,total_size_name)
  676. print(" * Total [FILE SIZES]: "+str(total_files_size)+"\n")
  677. if total_dataset_size:
  678. print(" + [DATASET]: "+str(total_dataset_size)+"\n")
  679. if total_list_size:
  680. print(" + [LIST]: "+str(total_list_size)+"\n")
  681. if total_brain_size:
  682. print(" + [BRAIN]: "+str(total_brain_size)+"\n")
  683. def extract_total_patterns_learned_from_local(memory):
  684. total_patterns = 0
  685. for m in memory:
  686. total_patterns = total_patterns + 1
  687. print(" * [SETTINGS] Using [MAX. LENGTH] for range [PATTERN] = [ "+str(max_length)+" ]\n")
  688. if total_patterns > 0:
  689. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]\n")
  690. else:
  691. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]")
  692. generate_pattern_len_report_structure(memory)
  693. return memory
  694. def list_genomes_on_database():
  695. print("[LIST] [REPORTING] [DNA SECUENCES] ... -> [STARTING!]\n")
  696. f=open(dna_codons_list_path, 'r')
  697. codons = f.readlines()
  698. f.close()
  699. print("-"*15 + "\n")
  700. f=open(open_reading_frames_init_path, 'r')
  701. frames_init = f.readlines()
  702. f.close()
  703. f=open(open_reading_frames_end_path, 'r')
  704. frames_end = f.readlines()
  705. f.close()
  706. f=open(genomes_list_path, 'w')
  707. for k, v in genomes.items():
  708. print ("* "+str(k))
  709. print ("\n + Total [NUCLEOTIDS]: [ "+str(len(v)-1)+" bp linear RNA ]\n")
  710. print (" - [A] Adenine :", str(v.count("A")))
  711. print (" - [G] Guanine :", str(v.count("G")))
  712. print (" - [C] Cytosine :", str(v.count("C")))
  713. print (" - [T] Thymine :", str(v.count("T")))
  714. f.write(str("* "+str(k)+"\n"))
  715. f.write(str("\n + Total [NUCLEOTIDS]: [ "+str(len(v)-1)+" bp linear RNA ]\n"))
  716. f.write(str(" - [A] Adenine : " + str(v.count("A"))+"\n"))
  717. f.write(str(" - [G] Guanine : " + str(v.count("G"))+"\n"))
  718. f.write(str(" - [C] Cytosine : " + str(v.count("C"))+"\n"))
  719. f.write(str(" - [T] Thymine : " + str(v.count("T"))+"\n"))
  720. if v.count("N") > 0:
  721. print (" - [N] *ANY* :", str(v.count("N")))
  722. f.write(str(" - [N] *ANY* : "+ str(v.count("N"))+"\n"))
  723. total_codons = 0
  724. for c in codons:
  725. codon_counter = v.count(str(c.split(":")[0]))
  726. total_codons = total_codons + codon_counter
  727. print ("\n + Total [PATTERN CODONS!]: [ "+str(total_codons)+" ] time(s)\n")
  728. f.write(str("\n + Total [PATTERN CODONS!]: [ "+str(total_codons)+" ] time(s)\n"))
  729. for c in codons:
  730. codon_sec = str(c.split(":")[0])
  731. codon_name = str(c.split(":")[1].replace("\n",""))
  732. codon_counter = str(v.count(str(c.split(":")[0])))
  733. print (" - ["+codon_sec+"] "+codon_name+" :", codon_counter)
  734. f.write(str(" - ["+codon_sec+"] "+codon_name+" : "+ codon_counter)+"\n")
  735. if frames_init and frames_end:
  736. total_opr_found = 0
  737. r_found_by_pattern = 0
  738. opr_found_list = {}
  739. index = 0
  740. for opr_i in frames_init:
  741. opr_init_name = opr_i.replace("\n","")
  742. if str(opr_init_name) in str(v.upper()): # open reading INIT frame found!
  743. for opr_e in frames_end:
  744. opr_end_name = opr_e.replace("\n","")
  745. if str(opr_end_name) in str(v.upper()): # open reading END frame found!
  746. regex_opr = str(opr_init_name) +"(.+?)"+str(opr_end_name) # regex magics! - extract secuence between ocr_i and ocr_e
  747. pattern_record = re.compile(regex_opr)
  748. record = re.findall(pattern_record, str(v.upper()))
  749. for r in record: # now extract each field
  750. total_opr_found = total_opr_found + 1
  751. r_found_by_pattern = v.count(opr_init_name+r+opr_end_name)
  752. index = index + 1
  753. opr_found_list[index] = k, r_found_by_pattern, opr_init_name, r, opr_end_name # [index]: genome, num_times, opr_i, pattern, opr_e
  754. print ("\n + Total [OPEN READING FRAMES!]: [ "+str(total_opr_found)+" ] \n")
  755. f.write(str("\n + Total [OPEN READING FRAMES!]: [ "+str(total_opr_found)+" ] \n"))
  756. for m, n in opr_found_list.items():
  757. print(" - ["+str(n[2])+str(n[3])+str(n[4])+"] : [ "+str(n[1])+" ] time(s)")
  758. f.write(str(" - ["+str(n[2])+str(n[3])+str(n[4])+"] : "+ str(n[1]))+"\n")
  759. print ("")
  760. f.write("\n")
  761. print("-"*15 + "\n")
  762. print ("[LIST] [INFO] [SAVED!] at: '"+str(genomes_list_path)+"'... -> [EXITING!]\n")
  763. f.close()
  764. def examine_stored_brain_memory():
  765. memory = [] # list used as hot-memory
  766. f=open(brain_path, 'r')
  767. for line in f.readlines():
  768. if line not in memory:
  769. memory.append(line)
  770. f.close()
  771. if memory == "": # first time run!
  772. print ("[LIBRE-AI] [INFO] Not any [BRAIN] present ... -> [BUILDING ONE!]\n")
  773. print("-"*15 + "\n")
  774. for i in range(2, 11+1):
  775. seed = [random.randrange(0, 4) for _ in range(i)] # generate "static" genesis seed
  776. if seed not in seeds_checked:
  777. seeds_checked.append(seed)
  778. pattern = ""
  779. for n in seed:
  780. if n == 0:
  781. pattern += "A"
  782. elif n == 1:
  783. pattern += "C"
  784. elif n == 2:
  785. pattern += "T"
  786. else:
  787. pattern += "G"
  788. print("[LIBRE-AI] [SEARCH] Generating [RANDOM] pattern: " + str(pattern) + "\n")
  789. create_new_pattern(pattern) # create new pattern
  790. print("-"*15 + "\n")
  791. print ("[LIBRE-AI] [INFO] A new [BRAIN] has been created !!! ... -> [ADVANCING!]\n")
  792. f=open(brain_path, 'r')
  793. memory = f.read().replace('\n',' ')
  794. f.close()
  795. return memory
  796. def generate_pattern_len_report_structure(memory):
  797. pattern_len_1 = 0 # related with [MAX. LENGTH] range
  798. pattern_len_2 = 0
  799. pattern_len_3 = 0
  800. pattern_len_4 = 0
  801. pattern_len_5 = 0
  802. pattern_len_6 = 0
  803. pattern_len_7 = 0
  804. pattern_len_8 = 0
  805. pattern_len_9 = 0
  806. pattern_len_10 = 0
  807. pattern_len_11 = 0
  808. pattern_len_12 = 0
  809. pattern_len_13 = 0
  810. pattern_len_14 = 0
  811. pattern_len_15 = 0
  812. pattern_len_16 = 0
  813. pattern_len_17 = 0
  814. pattern_len_18 = 0
  815. pattern_len_19 = 0
  816. pattern_len_20 = 0
  817. pattern_len_21 = 0
  818. pattern_len_22 = 0
  819. pattern_len_23 = 0
  820. pattern_len_24 = 0
  821. pattern_len_25 = 0
  822. pattern_len_26 = 0
  823. pattern_len_27 = 0
  824. pattern_len_28 = 0
  825. pattern_len_29 = 0
  826. pattern_len_30 = 0
  827. pattern_len_31 = 0
  828. pattern_len_32 = 0
  829. pattern_len_33 = 0
  830. pattern_len_34 = 0
  831. pattern_len_35 = 0
  832. pattern_len_36 = 0
  833. pattern_len_37 = 0
  834. pattern_len_38 = 0
  835. pattern_len_39 = 0
  836. pattern_len_40 = 0
  837. pattern_len_41 = 0
  838. pattern_len_42 = 0
  839. pattern_len_43 = 0
  840. pattern_len_44 = 0
  841. pattern_len_45 = 0
  842. pattern_len_46 = 0
  843. pattern_len_47 = 0
  844. pattern_len_48 = 0
  845. pattern_len_49 = 0
  846. pattern_len_50 = 0
  847. for m in memory:
  848. try:
  849. pattern_len = m.split(", '")[1]
  850. pattern_len = pattern_len.split("')")[0]
  851. pattern_len = len(pattern_len)
  852. except:
  853. pattern_len = 0 # discard!
  854. if pattern_len == 1:
  855. pattern_len_1 = pattern_len_1 + 1
  856. elif pattern_len == 2:
  857. pattern_len_2 = pattern_len_2 + 1
  858. elif pattern_len == 3:
  859. pattern_len_3 = pattern_len_3 + 1
  860. elif pattern_len == 4:
  861. pattern_len_4 = pattern_len_4 + 1
  862. elif pattern_len == 5:
  863. pattern_len_5 = pattern_len_5 + 1
  864. elif pattern_len == 6:
  865. pattern_len_6 = pattern_len_6 + 1
  866. elif pattern_len == 7:
  867. pattern_len_7 = pattern_len_7 + 1
  868. elif pattern_len == 8:
  869. pattern_len_8 = pattern_len_8 + 1
  870. elif pattern_len == 9:
  871. pattern_len_9 = pattern_len_9 + 1
  872. elif pattern_len == 10:
  873. pattern_len_10 = pattern_len_10 + 1
  874. elif pattern_len == 11:
  875. pattern_len_11 = pattern_len_11 + 1
  876. elif pattern_len == 12:
  877. pattern_len_12 = pattern_len_12 + 1
  878. elif pattern_len == 13:
  879. pattern_len_13 = pattern_len_13 + 1
  880. elif pattern_len == 14:
  881. pattern_len_14 = pattern_len_14 + 1
  882. elif pattern_len == 15:
  883. pattern_len_15 = pattern_len_15 + 1
  884. elif pattern_len == 16:
  885. pattern_len_16 = pattern_len_16 + 1
  886. elif pattern_len == 17:
  887. pattern_len_17 = pattern_len_17 + 1
  888. elif pattern_len == 18:
  889. pattern_len_18 = pattern_len_18 + 1
  890. elif pattern_len == 19:
  891. pattern_len_19 = pattern_len_19 + 1
  892. elif pattern_len == 20:
  893. pattern_len_20 = pattern_len_20 + 1
  894. elif pattern_len == 21:
  895. pattern_len_21 = pattern_len_21 + 1
  896. elif pattern_len == 22:
  897. pattern_len_22 = pattern_len_22 + 1
  898. elif pattern_len == 23:
  899. pattern_len_23 = pattern_len_23 + 1
  900. elif pattern_len == 24:
  901. pattern_len_24 = pattern_len_24 + 1
  902. elif pattern_len == 25:
  903. pattern_len_25 = pattern_len_25 + 1
  904. elif pattern_len == 26:
  905. pattern_len_26 = pattern_len_26 + 1
  906. elif pattern_len == 27:
  907. pattern_len_27 = pattern_len_27 + 1
  908. elif pattern_len == 28:
  909. pattern_len_28 = pattern_len_28 + 1
  910. elif pattern_len == 29:
  911. pattern_len_29 = pattern_len_29 + 1
  912. elif pattern_len == 30:
  913. pattern_len_30 = pattern_len_30 + 1
  914. elif pattern_len == 31:
  915. pattern_len_31 = pattern_len_31 + 1
  916. elif pattern_len == 32:
  917. pattern_len_32 = pattern_len_32 + 1
  918. elif pattern_len == 33:
  919. pattern_len_33 = pattern_len_33 + 1
  920. elif pattern_len == 34:
  921. pattern_len_34 = pattern_len_34 + 1
  922. elif pattern_len == 35:
  923. pattern_len_35 = pattern_len_35 + 1
  924. elif pattern_len == 36:
  925. pattern_len_36 = pattern_len_36 + 1
  926. elif pattern_len == 37:
  927. pattern_len_37 = pattern_len_37 + 1
  928. elif pattern_len == 38:
  929. pattern_len_38 = pattern_len_38 + 1
  930. elif pattern_len == 39:
  931. pattern_len_39 = pattern_len_39 + 1
  932. elif pattern_len == 40:
  933. pattern_len_40 = pattern_len_40 + 1
  934. elif pattern_len == 41:
  935. pattern_len_41 = pattern_len_41 + 1
  936. elif pattern_len == 42:
  937. pattern_len_42 = pattern_len_42 + 1
  938. elif pattern_len == 43:
  939. pattern_len_43 = pattern_len_43 + 1
  940. elif pattern_len == 44:
  941. pattern_len_44 = pattern_len_44 + 1
  942. elif pattern_len == 45:
  943. pattern_len_45 = pattern_len_45 + 1
  944. elif pattern_len == 46:
  945. pattern_len_46 = pattern_len_46 + 1
  946. elif pattern_len == 47:
  947. pattern_len_47 = pattern_len_47 + 1
  948. elif pattern_len == 48:
  949. pattern_len_48 = pattern_len_48 + 1
  950. elif pattern_len == 49:
  951. pattern_len_49 = pattern_len_49 + 1
  952. elif pattern_len == 50:
  953. pattern_len_50 = pattern_len_50 + 1
  954. else:
  955. pass
  956. if pattern_len_1 > 0:
  957. print(" - [length = 1] : [ "+str(pattern_len_1)+" ]")
  958. if pattern_len_2 > 0:
  959. print(" - [length = 2] : [ "+str(pattern_len_2)+" ]")
  960. if pattern_len_3 > 0:
  961. print(" - [length = 3] : [ "+str(pattern_len_3)+" ]")
  962. if pattern_len_4 > 0:
  963. print(" - [length = 4] : [ "+str(pattern_len_4)+" ]")
  964. if pattern_len_5 > 0:
  965. print(" - [length = 5] : [ "+str(pattern_len_5)+" ]")
  966. if pattern_len_6 > 0:
  967. print(" - [length = 6] : [ "+str(pattern_len_6)+" ]")
  968. if pattern_len_7 > 0:
  969. print(" - [length = 7] : [ "+str(pattern_len_7)+" ]")
  970. if pattern_len_8 > 0:
  971. print(" - [length = 8] : [ "+str(pattern_len_8)+" ]")
  972. if pattern_len_9 > 0:
  973. print(" - [length = 9] : [ "+str(pattern_len_9)+" ]")
  974. if pattern_len_10 > 0:
  975. print(" - [length = 10]: [ "+str(pattern_len_10)+" ]")
  976. if pattern_len_11 > 0:
  977. print(" - [length = 11]: [ "+str(pattern_len_11)+" ]")
  978. if pattern_len_12 > 0:
  979. print(" - [length = 12]: [ "+str(pattern_len_12)+" ]")
  980. if pattern_len_13 > 0:
  981. print(" - [length = 13]: [ "+str(pattern_len_13)+" ]")
  982. if pattern_len_14 > 0:
  983. print(" - [length = 14]: [ "+str(pattern_len_14)+" ]")
  984. if pattern_len_15 > 0:
  985. print(" - [length = 15]: [ "+str(pattern_len_15)+" ]")
  986. if pattern_len_16 > 0:
  987. print(" - [length = 16]: [ "+str(pattern_len_16)+" ]")
  988. if pattern_len_17 > 0:
  989. print(" - [length = 17]: [ "+str(pattern_len_17)+" ]")
  990. if pattern_len_18 > 0:
  991. print(" - [length = 18]: [ "+str(pattern_len_18)+" ]")
  992. if pattern_len_19 > 0:
  993. print(" - [length = 19]: [ "+str(pattern_len_19)+" ]")
  994. if pattern_len_20 > 0:
  995. print(" - [length = 20]: [ "+str(pattern_len_20)+" ]")
  996. if pattern_len_21 > 0:
  997. print(" - [length = 21]: [ "+str(pattern_len_21)+" ]")
  998. if pattern_len_22 > 0:
  999. print(" - [length = 22]: [ "+str(pattern_len_22)+" ]")
  1000. if pattern_len_23 > 0:
  1001. print(" - [length = 23]: [ "+str(pattern_len_23)+" ]")
  1002. if pattern_len_24 > 0:
  1003. print(" - [length = 24]: [ "+str(pattern_len_24)+" ]")
  1004. if pattern_len_25 > 0:
  1005. print(" - [length = 25]: [ "+str(pattern_len_25)+" ]")
  1006. if pattern_len_26 > 0:
  1007. print(" - [length = 26]: [ "+str(pattern_len_26)+" ]")
  1008. if pattern_len_27 > 0:
  1009. print(" - [length = 27]: [ "+str(pattern_len_27)+" ]")
  1010. if pattern_len_28 > 0:
  1011. print(" - [length = 28]: [ "+str(pattern_len_28)+" ]")
  1012. if pattern_len_29 > 0:
  1013. print(" - [length = 29]: [ "+str(pattern_len_29)+" ]")
  1014. if pattern_len_30 > 0:
  1015. print(" - [length = 30]: [ "+str(pattern_len_30)+" ]")
  1016. if pattern_len_31 > 0:
  1017. print(" - [length = 31]: [ "+str(pattern_len_31)+" ]")
  1018. if pattern_len_32 > 0:
  1019. print(" - [length = 32]: [ "+str(pattern_len_32)+" ]")
  1020. if pattern_len_33 > 0:
  1021. print(" - [length = 33]: [ "+str(pattern_len_33)+" ]")
  1022. if pattern_len_34 > 0:
  1023. print(" - [length = 34]: [ "+str(pattern_len_34)+" ]")
  1024. if pattern_len_35 > 0:
  1025. print(" - [length = 35]: [ "+str(pattern_len_35)+" ]")
  1026. if pattern_len_36 > 0:
  1027. print(" - [length = 36]: [ "+str(pattern_len_36)+" ]")
  1028. if pattern_len_37 > 0:
  1029. print(" - [length = 37]: [ "+str(pattern_len_37)+" ]")
  1030. if pattern_len_38 > 0:
  1031. print(" - [length = 38]: [ "+str(pattern_len_38)+" ]")
  1032. if pattern_len_39 > 0:
  1033. print(" - [length = 39]: [ "+str(pattern_len_39)+" ]")
  1034. if pattern_len_40 > 0:
  1035. print(" - [length = 40]: [ "+str(pattern_len_40)+" ]")
  1036. if pattern_len_41 > 0:
  1037. print(" - [length = 41]: [ "+str(pattern_len_41)+" ]")
  1038. if pattern_len_42 > 0:
  1039. print(" - [length = 42]: [ "+str(pattern_len_42)+" ]")
  1040. if pattern_len_43 > 0:
  1041. print(" - [length = 43]: [ "+str(pattern_len_43)+" ]")
  1042. if pattern_len_44 > 0:
  1043. print(" - [length = 44]: [ "+str(pattern_len_44)+" ]")
  1044. if pattern_len_45 > 0:
  1045. print(" - [length = 45]: [ "+str(pattern_len_45)+" ]")
  1046. if pattern_len_46 > 0:
  1047. print(" - [length = 46]: [ "+str(pattern_len_46)+" ]")
  1048. if pattern_len_47 > 0:
  1049. print(" - [length = 47]: [ "+str(pattern_len_47)+" ]")
  1050. if pattern_len_48 > 0:
  1051. print(" - [length = 48]: [ "+str(pattern_len_48)+" ]")
  1052. if pattern_len_49 > 0:
  1053. print(" - [length = 49]: [ "+str(pattern_len_49)+" ]")
  1054. if pattern_len_50 > 0:
  1055. print(" - [length = 50]: [ "+str(pattern_len_50)+" ]")
  1056. def print_banner():
  1057. print("\n"+"="*50)
  1058. print(" ____ _ _ _ _ ")
  1059. print("| _ \(_) __ _| \ | | / \ ")
  1060. print("| | | | |/ _` | \| | / _ \ ")
  1061. print("| |_| | | (_| | |\ |/ ___ \ ")
  1062. print("|____/|_|\__,_|_| \_/_/ \_\ by psy")
  1063. print('\n"Search and Recognize patterns in DNA sequences"')
  1064. print("\n"+"="*50)
  1065. print("+ GENOMES DETECTED:", str(num_files))
  1066. print("="*50)
  1067. print("\n"+"-"*15+"\n")
  1068. print(" * VERSION: ")
  1069. print(" + "+VERSION+" - (rev:"+RELEASE+")")
  1070. print("\n * SOURCES:")
  1071. print(" + "+SOURCE1)
  1072. print(" + "+SOURCE2)
  1073. print("\n * CONTACT: ")
  1074. print(" + "+CONTACT+"\n")
  1075. print("-"*15+"\n")
  1076. print("="*50)
  1077. # sub_init #
  1078. num_files=0
  1079. for file in glob.iglob(genomes_path + '**/*', recursive=True):
  1080. if(file.endswith(".genome")):
  1081. num_files = num_files + 1
  1082. f=open(file, 'r')
  1083. genome = f.read().replace('\n',' ')
  1084. genomes[file.replace("datasets/","")] = genome.upper() # add genome to main dict
  1085. f.close()
  1086. print_banner() # show banner
  1087. option = input("\n+ CHOOSE: (S)earch, (L)ist, (T)rain or (R)eport: ").upper()
  1088. print("")
  1089. print("="*50+"\n")
  1090. if option == "S": # search pattern
  1091. search_pattern_with_human()
  1092. elif option == "L": # list genomes
  1093. list_genomes_on_database()
  1094. elif option == "T": # teach AI
  1095. teach_ai()
  1096. else: # libre AI
  1097. libre_ai()
  1098. print ("="*50+"\n")