
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
DiaNA - 2020 - by psy (epsylon@riseup.net)
You should have received a copy of the GNU General Public License along
with DiaNA; if not, write to the Free Software Foundation, Inc., 51
Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
"""
VERSION = "v0.3_beta"
RELEASE = "19032020"
SOURCE1 = "https://code.03c8.net/epsylon/diana"
SOURCE2 = "https://github.com/epsylon/diana"
CONTACT = "epsylon@riseup.net - (https://03c8.net)"
"""
DNA-equiv:
A <-> T
C <-> G
"""
import re, os, glob, random, time, math
brain_path = "resources/BRAIN/brain.in" # in/out brain-tmp file
genomes_path = 'datasets/' # genome datasets raw data
genomes_list_path = "datasets/genome.list" # genome list
universal_primer_list_path = "resources/PATTERNS/UPL.list" # UPL list
dna_codons_list_path = "resources/PATTERNS/CODONS/DNAcodon.list" # DNA codon list
protein_formula_path = "resources/PATTERNS/CODONS/AAformula.list" # Protein Chemical Formula list
open_reading_frames_init_path = "resources/PATTERNS/ORF/ORF-init.list" # ORF init list
open_reading_frames_end_path = "resources/PATTERNS/ORF/ORF-end.list" # ORF end list
genomes = {} # main sources dict: genome_name
seeds_checked = [] # list used for random checked patterns
repeats = {} # repetitions 'tmp' dict: genome_name:(repets,pattern)
known_patterns = [] # list used for known patterns
max_length = 50 # [MAX. LENGTH] for range [PATTERN]
SUB = str.maketrans("0123456789", "₀₁₂₃₄₅₆₇₈₉")
SUP = str.maketrans("0123456789", "⁰¹²³⁴⁵⁶⁷⁸⁹")
def convert_size(size):
    if (size == 0):
        return '0 B'
    size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
    i = int(math.floor(math.log(size,1024)))
    p = math.pow(1024,i)
    s = round(size/p,2)
    return s, size_name[i]
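# Illustrative note (not from the original source): convert_size(2048) returns the
# tuple (2.0, 'KB'), which callers join with '%s %s' into the string "2.0 KB".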
def search_pattern_with_human():
    pattern = input("[HUMAN] [SEARCH] Pattern (ex: attacg): ").upper()
    print("\n"+"-"*5 + "\n")
    create_new_pattern(pattern) # create new pattern
def try_pattern_against_all_genomes_by_genome(pattern):
    for k, v in genomes.items():
        if pattern in v:
            t = len(re.findall(pattern, v))
            repeats[k] = t, pattern # create dict: genome = times, pattern
def try_pattern_against_all_genomes_by_pattern(pattern, index):
    p_index = 0 # pattern index
    for k, v in genomes.items():
        if pattern in v:
            p_index = p_index + 1
            t = len(re.findall(pattern, v))
            repeats[index,p_index] = pattern, k, t # create dict: index, p_index = pattern, genome, times
def sanitize_dna_pattern(pattern): # a pattern is valid only if made of A, T, G, C or N
    valid_pattern = True
    for c in pattern:
        if c not in ("A", "T", "G", "C", "N"):
            valid_pattern = False
    return valid_pattern
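# Illustrative only: sanitize_dna_pattern("ATTACG") -> True, while
# sanitize_dna_pattern("EXIT") -> False (any character outside A/T/G/C/N invalidates it).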
def teach_ai():
    mode = input("[TRAIN-AI] MODE -> (H)uman, (A)utomata: ").upper()
    if not os.path.isfile(brain_path):
        create_initial_seed_file()
    if mode == "H": # human mode
        teach_ai_human_mode()
    else: # libre AI
        teach_ai_automata_mode() # automata mode
def teach_ai_human_mode(): # search/discard patterns with human interaction & generate local database
    search_patterns_lesson_with_a_human()
def search_patterns_lesson_with_a_human():
    print("\n"+"-"*30)
    print("\n[TRAIN-AI] [HUMAN] [STOP] this mode by entering any invalid pattern (ex: 'exit' or 'q').\n")
    key = "K" # continue
    while key == "K":
        pattern = input("[TRAIN-AI] [HUMAN] [LOOP] [SEARCH] Pattern (ex: attacg): ").upper()
        print("\n"+"-"*5 + "\n")
        key = search_pattern_on_lesson(pattern)
        if key == "Z": # stop
            break
def search_pattern_on_lesson(pattern):
    valid_pattern = sanitize_dna_pattern(pattern)
    if valid_pattern == True:
        key = search_pattern_on_local_database(pattern) # search pattern on local database
    else:
        print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
        key = "Z" # stop
    return key
def search_pattern_on_local_database(pattern):
    f=open(brain_path, 'r')
    memory = f.read().replace('\n',' ')
    f.close()
    patterns_known = 0
    if not "'"+pattern+"'" in memory: # always create new patterns
        create_new_pattern(pattern) # create new pattern
        patterns_known = patterns_known + 1
    else:
        for k, v in genomes.items(): # create patterns found for new genomes
            if k not in memory:
                create_new_pattern(pattern) # create new pattern
                patterns_known = patterns_known + 1
    if patterns_known == 0:
        print("[TRAIN-AI] [AUTOMATA] [LOOP] [RESULTS] -ALREADY- [LEARNED!] ... -> [GOING FOR NEXT!]\n")
        print("-"*5 + "\n")
    key = "K" # continue
    return key
def create_initial_seed_file():
    f=open(brain_path, 'w')
    f.write("")
    f.close()
def create_new_pattern(pattern): # append it to brain
    valid_pattern = sanitize_dna_pattern(pattern)
    if valid_pattern == True:
        if pattern not in known_patterns:
            known_patterns.append(pattern)
        try_pattern_against_all_genomes_by_genome(pattern) # generate repeats dict
        patterns_found = 0
        for k, v in repeats.items(): # list patterns found to output
            print (" *", k +":", "-> ",v,"")
            patterns_found = patterns_found + 1
        print("")
        if patterns_found == 0:
            print("[INFO] -> None found! ... [EXITING!]\n")
        else:
            f=open(brain_path, 'a')
            f.write(str(repeats)+os.linesep) # add dict as str
            f.close()
    else:
        print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
def teach_ai_automata_mode(): # search patterns by bruteforcing ranges & generate local database
    search_patterns_lesson_with_an_ai()
def search_patterns_lesson_with_an_ai():
    print("\n"+"-"*30)
    print("\n[TRAIN-AI] [AUTOMATA] [STOP] this mode by pressing 'CTRL+z'.\n")
    ranges = input("[TRAIN-AI] [AUTOMATA] [SEARCH] Set range (x<y) for pattern deep searching (ex: 2-8): ")
    print ("")
    valid_range, ranged_permutations = check_for_deep_searching_ranges(ranges)
    if str(valid_range) == "OK!":
        ranged_ending = False
        print("-"*15)
        print("\n[TRAIN-AI] [AUTOMATA] [SEARCH] Number of [PERMUTATIONS] estimated: [ "+str(ranged_permutations)+" ]\n")
        print("-"*15+"\n")
        num_pat = 0
        while ranged_ending == False: # try to STOP it using: CTRL-z
            try:
                pattern, ranged_ending = generate_random_pattern(ranges, ranged_permutations) # generate random seed
                if pattern:
                    num_pat = num_pat + 1
                    print("[TRAIN-AI] [AUTOMATA] [LOOP] [SEARCH] Generating [RANDOM!] ["+str(num_pat)+"/"+str(ranged_permutations)+"] pattern: [ " + str(pattern) + " ]\n")
                    if not num_pat == ranged_permutations:
                        search_pattern_on_lesson(pattern)
                    else:
                        search_pattern_on_lesson(pattern)
                        print("[TRAIN-AI] [AUTOMATA] [RESULTS]: REVIEWED -> [ "+str(ranged_permutations)+" PERMUTATIONS ] ... -> [EXITING!]\n")
                        ranged_ending = True
            except:
                pass
    else:
        print("-"*15+"\n")
        print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] -> "+valid_range+" ... [EXITING!]\n")
def generate_random_pattern(ranges, ranged_permutations):
    ranged_length = 0
    try:
        range_low = int(ranges.split("-")[0])
        range_high = int(ranges.split("-")[1])
        for i in range(range_low, range_high+1):
            ranged_length = ranged_length + 1
            if ranged_length == ranged_permutations: # all possible variables have been bruteforced/checked! -> exit
                pattern = None
                ranged_ending = True
                return pattern, ranged_ending
            else:
                ranged_ending = False
                seed = [random.randrange(0, 4) for _ in range(i)] # generate "random" seed
                if seed not in seeds_checked:
                    seeds_checked.append(seed)
                    pattern = ""
                    for n in seed:
                        if n == 0:
                            pattern += "A"
                        elif n == 1:
                            pattern += "C"
                        elif n == 2:
                            pattern += "T"
                        else:
                            pattern += "G"
                    return pattern, ranged_ending
    except:
        print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] ... [EXITING!]\n")
        pattern = None
        ranged_ending = True
        return pattern, ranged_ending
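# Illustrative only: each seed value maps 0->A, 1->C, 2->T, 3->G, so a seed of
# [0, 3, 1] would yield the pattern "AGC".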
def check_for_deep_searching_ranges(ranges):
    try:
        range_low = ranges.split("-")[0]
        range_high = ranges.split("-")[1]
    except:
        valid_range = "'bad format'"
    try:
        range_low = int(range_low)
    except:
        valid_range = "'low range' should be an integer"
    try:
        range_high = int(range_high)
    except:
        valid_range = "'high range' should be an integer"
    try:
        if range_low < range_high:
            if range_low > 1: # always range > 1
                valid_range = "OK!"
            else:
                valid_range = "'low range' should be > than 1"
        else:
            valid_range = "'low range' should be < than 'high range'"
    except:
        valid_range = "'bad format'"
    try:
        ranged_permutations = math_ranged_permutations(range_low, range_high)
    except:
        ranged_permutations = 0
        valid_range = "'bad format'"
    return valid_range, ranged_permutations
def math_ranged_permutations(range_low, range_high): # calculate ranged_permutations
    ranged_permutations = 0
    for i in range(range_low, range_high+1):
        ranged_permutations = ranged_permutations + (4**i)
    return ranged_permutations
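# Illustrative only: the estimate sums 4**i for every pattern length in the range,
# so for the range "2-3" it is 4**2 + 4**3 = 80 candidate patterns.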
def libre_ai(): # show statistics / download new genomes / keep crossing new genomes with local database / search for new patterns (non stop!)
    if not os.path.isfile(brain_path):
        create_initial_seed_file()
    memory = examine_stored_brain_memory()
    if memory != "":
        #print("[LIBRE-AI] [STOP] this mode; pressing 'CTRL+z'.\n")
        libre_ai_show_statistics(memory) # show statistics
def libre_ai_show_statistics(memory):
    print("[LIBRE-AI] [REPORTING] [STATISTICS] ... -> [STARTING!]\n")
    print("-"*15 + "\n")
    total_genomes = 0
    total_adenine = 0
    total_guanine = 0
    total_cytosine = 0
    total_thymine = 0
    total_any = 0
    total_patterns = 0
    secuence_length = 0
    secuences_length_list = {}
    largest = None
    largest_len = 0
    shortest_len = 0
    average = None
    shortest = None
    for k, v in genomes.items():
        secuence_length = len(v)
        secuences_length_list[k] = str(secuence_length)
        total_genomes = total_genomes + 1
        total_adenine = total_adenine + v.count("A")
        total_guanine = total_guanine + v.count("G")
        total_cytosine = total_cytosine + v.count("C")
        total_thymine = total_thymine + v.count("T")
        total_any = total_any + v.count("N")
    path = genomes_path # genome datasets raw data
    l = glob.glob(genomes_path+"*") # black magic!
    latest_collection_file = max(l, key=os.path.getctime)
    latest_collection_date = time.ctime(os.path.getmtime(latest_collection_file))
    total_nucleotids = [total_adenine, total_guanine, total_cytosine, total_thymine, total_any]
    num_total_nucleotids = total_adenine + total_guanine + total_cytosine + total_thymine + total_any
    nucleotid_more_present = max(total_nucleotids)
    print("[LIBRE-AI] [REPORTING] -STORAGE- [STATISTICS]: \n")
    extract_storage_sizes()
    print(" * [LATEST UPDATE]: '"+str(latest_collection_date)+"'\n")
    print(" + File: '"+str(latest_collection_file)+"'\n")
    print("-"*5 + "\n")
    print("[LIBRE-AI] [REPORTING] -COLLECTION- [STATISTICS]: \n")
    extract_total_patterns_learned_from_local(memory)
    print("\n"+"-"*5 + "\n")
    print("[LIBRE-AI] [REPORTING] -ANALYSIS- [STATISTICS]: \n")
    print(" * Total [DNA SECUENCES]: [ "+str(total_genomes)+" ]\n")
    largest = 0
    largest_pattern_name = []
    largest_pattern_size = []
    for k, v in secuences_length_list.items():
        if int(v) > int(largest):
            largest = v
            largest_pattern_name.append(k)
            largest_pattern_size.append(largest)
    for p in largest_pattern_name:
        largest_pattern_name = p
    for s in largest_pattern_size:
        largest_pattern_size = s
    print(" + [LARGEST] : "+str(largest_pattern_name)+ " [ "+str(largest_pattern_size)+" bp linear RNA ]")
    prev_shortest = None
    shortest_pattern_name = []
    shortest_pattern_size = []
    for k, v in secuences_length_list.items():
        if prev_shortest == None:
            shortest = v
            shortest_pattern_name.append(k)
            shortest_pattern_size.append(shortest)
            prev_shortest = True
        else:
            if int(v) < int(shortest):
                shortest = v
                shortest_pattern_name.append(k)
                shortest_pattern_size.append(shortest)
    for p in shortest_pattern_name:
        shortest_pattern_name = p
    for s in shortest_pattern_size:
        shortest_pattern_size = s
    print(" + [SHORTEST]: "+str(shortest_pattern_name)+ " [ "+str(shortest_pattern_size)+" bp linear RNA ]\n")
    print(" * Total [NUCLEOTIDS]: [ "+str(num_total_nucleotids)+" ]\n")
    if nucleotid_more_present == total_adenine:
        print(" + [A] Adenine : "+str(total_adenine)+" <- [MAX]")
    else:
        print(" + [A] Adenine : "+str(total_adenine))
    if nucleotid_more_present == total_guanine:
        print(" + [G] Guanine : "+str(total_guanine)+" <- [MAX]")
    else:
        print(" + [G] Guanine : "+str(total_guanine))
    if nucleotid_more_present == total_cytosine:
        print(" + [C] Cytosine : "+str(total_cytosine)+" <- [MAX]")
    else:
        print(" + [C] Cytosine : "+str(total_cytosine))
    if nucleotid_more_present == total_thymine:
        print(" + [T] Thymine : "+str(total_thymine)+" <- [MAX]")
    else:
        print(" + [T] Thymine : "+str(total_thymine))
    if total_any > 0:
        if nucleotid_more_present == total_any:
            print(" + [N] *ANY* : "+str(total_any)+" <- [MAX]")
        else:
            print(" + [N] *ANY* : "+str(total_any))
    print("\n"+"-"*5 + "\n")
    extract_pattern_most_present_local(memory)
def convert_memory_to_dict(memory): # [index] = genome_name, pattern, num_rep
    memory_dict = {}
    index = 0
    for m in memory:
        regex_record = "'(.+?)': (.+?), '(.+?)'" # regex magics! - extract first each record
        pattern_record = re.compile(regex_record)
        record = re.findall(pattern_record, m)
        for r in record: # now extract each field
            index = index + 1
            name = str(r).split("', '(")[0]
            genome_name = str(name).split("'")[1]
            repeats = str(r).split("', '(")[1]
            genome_repeats = str(repeats).split("',")[0]
            pattern = str(repeats).split("',")[1]
            genome_pattern = pattern.replace(" ", "")
            genome_pattern = genome_pattern.replace("'", "")
            genome_pattern = genome_pattern.replace(")", "")
            memory_dict[index] = genome_name, genome_pattern, genome_repeats # generate memory_dict!
    return memory_dict
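# Illustrative only (the genome file name is hypothetical): a brain.in record such as
# "{'example.genome': (3, 'ATTACG')}" is parsed here into
# memory_dict[1] = ('example.genome', 'ATTACG', '3').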
def extract_pattern_most_present_local(memory):
    memory_dict = convert_memory_to_dict(memory)
    if genomes:
        try:
            f=open(dna_codons_list_path, 'r')
            codons = f.readlines()
            f.close()
        except:
            pass
        print("[LIBRE-AI] [REPORTING] -RESEARCHING- [STATISTICS]: \n")
        total_genomes = 0
        for k, v in genomes.items():
            total_genomes = total_genomes + 1
        if memory_dict:
            total_patterns = 0
            for m in memory:
                total_patterns = total_patterns + 1 # counter used for known patterns
            max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list = extract_patterns_most_found_in_all_genomes(memory_dict)
            print(" * Searching -[ "+str(total_patterns)+" ]- [PATTERNS LEARNED!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
            if total_patterns_all_genomes:
                print("\n + Total [PATTERNS FOUND!]: [ "+str(total_patterns_all_genomes)+" ]")
                biggest_pattern_name_codon = None
                for c in codons:
                    if c.split(":")[0] == str(biggest_pattern_name):
                        biggest_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
                        print("\n - [MOST-PRESENT!]: [ "+str(biggest_pattern_size)+" ] time(s) -> [ "+str(biggest_pattern_name)+" ] "+str(biggest_pattern_name_codon)+"\n")
                if biggest_pattern_name_codon == None:
                    print("\n - [MOST-PRESENT!]: [ "+str(biggest_pattern_size)+" ] time(s) -> [ "+str(biggest_pattern_name)+" ]\n")
                other_pattern_name_codon = None
                for k, v in most_present_patterns_by_len_list.items():
                    for c in codons:
                        if c.split(":")[0] == str(v[0]):
                            other_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
                            print(" * [length = "+str(k)+"] : [ "+str(v[1])+" ] time(s) -> [ "+str(v[0])+" ] "+str(other_pattern_name_codon))
                    if other_pattern_name_codon == None:
                        print(" * [length = "+str(k)+"] : [ "+str(v[1])+" ] time(s) -> [ "+str(v[0])+" ]")
                    other_pattern_name_codon = None
                smaller_pattern_name_codon = None
                for c in codons:
                    if c.split(":")[0] == str(smaller_pattern_name):
                        smaller_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
                        print("\n - [LESS-PRESENT!]: [ "+str(smaller_pattern_size)+" ] time(s) -> [ "+str(smaller_pattern_name)+" ] "+str(smaller_pattern_name_codon)+"\n")
                if smaller_pattern_name_codon == None:
                    print("\n - [LESS-PRESENT!]: [ "+str(smaller_pattern_size)+" ] time(s) -> [ "+str(smaller_pattern_name)+" ]\n")
                other_pattern_name_codon = None
                for n, m in less_present_patterns_by_len_list.items():
                    for c in codons:
                        if c.split(":")[0] == str(m[0]):
                            other_pattern_name_codon = str(c.split(":")[1].replace("\n",""))
                            print(" * [length = "+str(n)+"] : [ "+str(m[1])+" ] time(s) -> [ "+str(m[0])+" ] "+str(other_pattern_name_codon))
                    if other_pattern_name_codon == None:
                        print(" * [length = "+str(n)+"] : [ "+str(m[1])+" ] time(s) -> [ "+str(m[0])+" ]")
                    other_pattern_name_codon = None
                max_size_pattern_name = max(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
                less_size_pattern_name = min(most_present_patterns_by_len_list, key=most_present_patterns_by_len_list.get)
                print("\n - [LARGEST] : [ "+str(max_size_pattern_name)+" ] bp linear RNA")
                print(" - [SHORTEST]: [ "+str(less_size_pattern_name)+" ] bp linear RNA\n")
            else:
                print("\n + Total [PATTERNS FOUND!]: [ 0 ]\n")
        try:
            f=open(universal_primer_list_path, 'r')
            UPL = f.readlines()
            f.close()
            if UPL:
                extract_potential_primer_pairs(UPL, total_genomes, codons)
        except:
            pass
        if codons:
            extract_potential_dna_codons(codons, total_genomes)
def extract_potential_primer_pairs(UPL, total_genomes, codons):
    total_universal_primer_pairs = 0
    total_primer_pairs_found = 0
    primer_pairs_found_list = {}
    for pp in UPL:
        total_universal_primer_pairs = total_universal_primer_pairs + 1
        for k, v in genomes.items():
            pair_name = pp.split(":")[1].upper().replace("\n","")
            pair_sec = pp.split(":")[0]
            if str(pair_name) in str(v.upper()):
                pair_times = v.count(pair_name)
                total_primer_pairs_found += pair_times
                primer_pairs_found_list[pair_sec] = pair_name, total_primer_pairs_found
    print(" "+"-"*5+"\n")
    print(" * Searching -[ "+str(total_universal_primer_pairs)+" ]- [UNIVERSAL PRIMER PAIRS!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
    if total_primer_pairs_found:
        total_primer_pairs_found_list = 0
        for m, n in primer_pairs_found_list.items():
            total_primer_pairs_found_list = total_primer_pairs_found_list + n[1]
        print("\n + Total [UNIVERSAL PRIMER PAIRS FOUND!]: [ "+str(total_primer_pairs_found_list)+" ]\n")
        for m, n in primer_pairs_found_list.items():
            print(" * "+str(m)+" -> [ "+str(n[0])+" ] : [ "+str(n[1])+" ] time(s)")
        print ("")
    else:
        print("\n + Total [UNIVERSAL PRIMER PAIRS FOUND!]: [ 0 ]\n")
    print(" "+"-"*5+"\n")
def extract_potential_dna_codons(codons, total_genomes):
    total_codons = 0
    total_codons_found = 0
    codons_found_list = {}
    codons_found_list_by_codon = {}
    index = 0
    for c in codons:
        total_codons = total_codons + 1
        for k, v in genomes.items():
            codon_name = c.split(":")[0].upper().replace("\n","")
            if str(codon_name) in str(v.upper()):
                index = index + 1
                codons_times = v.count(codon_name)
                total_codons_found += codons_times
                codons_found_list[index] = codons_times, c.split(":")[0], str(c.split(":")[1]), k
    print(" * Searching -[ "+str(total_codons)+" ]- [AMINO ACIDS!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
    if total_codons_found:
        for m, n in codons_found_list.items():
            codon_sec = str(n[1])
            codon_name = str(n[2].replace("\n",""))
            if not codon_sec in codons_found_list_by_codon.keys():
                codons_found_list_by_codon[codon_sec] = codon_name, m
            else:
                for r, s in codons_found_list_by_codon.items():
                    if codon_sec == r:
                        new_v = s[1] + m
                        codons_found_list_by_codon[codon_sec] = codon_name, new_v
        codons_found_list_by_name = {}
        for g,z in codons_found_list_by_codon.items():
            if not z[0] in codons_found_list_by_name.keys():
                codons_found_list_by_name[z[0]]= z[1]
            else:
                for e, q in codons_found_list_by_name.items():
                    if z[0] == e:
                        new_s = q + z[1]
                        codons_found_list_by_name[z[0]] = new_s
        total_codons_by_codon = 0
        for p, f in codons_found_list_by_name.items():
            total_codons_by_codon = total_codons_by_codon + f
        print("\n + Total [AMINO ACIDS FOUND!]: [ "+str(total_codons_by_codon)+" ]\n")
        most_present_codons_found = max(codons_found_list_by_name, key=codons_found_list_by_name.get)
        less_present_codons_found = min(codons_found_list_by_name, key=codons_found_list_by_name.get)
        print(" - [MOST-PRESENT!]: "+str(most_present_codons_found))
        print(" - [LESS-PRESENT!]: "+str(less_present_codons_found)+"\n")
        for p, f in codons_found_list_by_name.items():
            print(" * "+str(p)+" : "+str(f)+" time(s)")
        print ("")
    else:
        print("\n + Total [AMINO ACIDS FOUND!]: [ 0 ]\n")
    print(" "+"-"*5+"\n")
    if total_genomes > 0:
        extract_protein_secuence(total_genomes, codons_found_list, codons)
def extract_open_reading_frames(total_genomes):
    try:
        f=open(open_reading_frames_init_path, 'r')
        frames_init = f.readlines()
        f.close()
    except:
        pass
    try:
        e=open(open_reading_frames_end_path, 'r')
        frames_end = e.readlines()
        e.close()
    except:
        pass
    if frames_init and frames_end:
        print(" * Searching [OPEN READING FRAMES!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:")
        total_opr_found = 0
        r_found_by_pattern = 0
        opr_found_list = {}
        index = 0
        for k, v in genomes.items():
            for opr_i in frames_init:
                opr_init_name = opr_i.replace("\n","")
                if str(opr_init_name) in str(v.upper()): # open reading INIT frame found!
                    for opr_e in frames_end:
                        opr_end_name = opr_e.replace("\n","")
                        if str(opr_end_name) in str(v.upper()): # open reading END frame found!
                            regex_opr = str(opr_init_name) +"(.+?)"+str(opr_end_name) # regex magics! - extract secuence between ocr_i and ocr_e
                            pattern_record = re.compile(regex_opr)
                            record = re.findall(pattern_record, str(v.upper()))
                            for r in record: # now extract each field
                                total_opr_found = total_opr_found + 1
                                r_found_by_pattern = v.count(opr_init_name+r+opr_end_name)
                                index = index + 1
                                opr_found_list[index] = k, r_found_by_pattern, opr_init_name, r, opr_end_name # [index]: genome, num_times, opr_i, pattern, opr_e
        if total_opr_found > 0:
            print("\n + Total [OPEN READING FRAMES FOUND!]: [ "+str(total_opr_found)+" ]\n")
            most_present_opr_found = max(opr_found_list, key=opr_found_list.get)
            largest_pattern = 0
            largest_pattern_found = None
            for m, n in opr_found_list.items():
                opr_found_init = str(n[2])
                opr_found_pattern = str(n[3])
                opr_found_end = str(n[4])
                opr_found_times = str(n[1])
                opr_found_genome = str(n[0])
                opr_found_pattern_len = len(opr_found_pattern)
                if opr_found_pattern_len > largest_pattern:
                    largest_pattern = opr_found_pattern_len
                    largest_pattern_found = opr_found_init, opr_found_pattern, opr_found_end, opr_found_genome
                if m == most_present_opr_found:
                    most_present_opr_found_init = str(n[2])
                    most_present_opr_found_pattern = str(n[3])
                    most_present_opr_found_end = str(n[4])
                    most_present_opr_found_times = str(n[1])
                    most_present_opr_found_genome = str(n[0])
            print(" - [MOST-PRESENT!]: [ "+str(most_present_opr_found_times)+" ] time(s) found in [ "+str(most_present_opr_found_genome)+" ] is -> [ "+str(most_present_opr_found_init)+"-{?}-"+str(most_present_opr_found_end)+" ]:\n")
            print(str(" * "+str(most_present_opr_found_init+most_present_opr_found_pattern+most_present_opr_found_end)))
            print("\n - [LARGEST]: [ "+str(len(largest_pattern_found[1]))+" bp linear RNA ] found in [ "+str(largest_pattern_found[3])+" ] is -> [ "+str(largest_pattern_found[0])+"-{?}-"+str(largest_pattern_found[2])+" ]:\n")
            print(str(" * "+str(largest_pattern_found[0]+largest_pattern_found[1]+largest_pattern_found[2])+"\n"))
        else:
            print("\n + Total [OPEN READING FRAMES FOUND!]: [ 0 ]\n")
    else:
        print("\n + Total [OPEN READING FRAMES FOUND!]: [ 0 ]\n")
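# Illustrative only (the ORF markers below are hypothetical; the real ones come from
# resources/PATTERNS/ORF/): with an init frame "ATG" and an end frame "TAG", the regex
# built above becomes "ATG(.+?)TAG", a non-greedy match for the sequence between both markers.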
def extract_protein_secuence(total_genomes, codons_found_list, codons):
    print(" * Searching [PROTEINS!] in -[ "+str(total_genomes)+ " ]- [DNA SECUENCES]:\n")
    total_protein_secuences_found = 0
    protein_secuences_list = {}
    index = 0
    p = {}
    for c in codons:
        codon_sec = c.split(":")[0]
        codon_name = c.split(":")[1].replace("\n","")
        p[codon_sec] = codon_name
    for k, v in genomes.items():
        ps = ""
        dna = str(v)
        for i in range(0, len(dna)-(3+len(dna)%3), 3): # searching protein secuence
            if "Stop" in p[dna[i:i+3]]:
                break
            ps += p[dna[i:i+3]].split("(")[1].split(")")[0]
        index = index + 1
        total_protein_secuences_found = total_protein_secuences_found + 1
        protein_secuences_list[index] = ps, k
        ps = "" # clean protein secuence
    if total_protein_secuences_found > 0:
        protein_most_present = {}
        for value in protein_secuences_list.values():
            if value[0] in protein_most_present.keys():
                protein_most_present[value[0]] = protein_most_present[value[0]] + 1
            else:
                protein_most_present[value[0]] = 1
        most_present_protein_found = max(protein_most_present, key=protein_most_present.get)
        print(" + Total [PROTEINS FOUND!]: [ "+str(total_protein_secuences_found)+" ]\n")
        largest_protein_secuence = 0
        largest_protein_secuence_found = None
        most_present_protein_found_counter = 0
        for m, n in protein_secuences_list.items():
            if most_present_protein_found == n[0]:
                most_present_protein_found_counter = most_present_protein_found_counter + 1
            protein_secuence_pattern_len = len(str(n[0]))
            if protein_secuence_pattern_len > largest_protein_secuence:
                largest_protein_secuence = protein_secuence_pattern_len
                largest_protein_secuence_found = m, n
        print(" - [MOST-PRESENT!]: [ "+str(most_present_protein_found_counter)+" ] time(s) is -> [ "+str(most_present_protein_found)+" ]\n")
        protein_chemical_formula = ""
        f = open(protein_formula_path, "r")
        formulas = f.readlines()
        f.close()
        for a in most_present_protein_found:
            for f in formulas:
                if a == f.split(":")[0]:
                    protein_chemical_formula += str(f.split(":")[1].replace("\n","")+"+")
        pcfl = len(protein_chemical_formula)
        protein_chemical_final_formula = protein_chemical_formula[:pcfl-1].translate(SUB)
        print(" *", protein_chemical_final_formula+"\n")
        print(" - [LARGEST]: [ "+str(len(largest_protein_secuence_found[1][0]))+" bp linear RNA ] found in [ "+str(largest_protein_secuence_found[1][1])+" ] is -> [ "+str(largest_protein_secuence_found[1][0])+" ]\n")
        largest_protein_chemical_formula = ""
        for a in largest_protein_secuence_found[1][0]:
            for f in formulas:
                if a == f.split(":")[0]:
                    largest_protein_chemical_formula += str(f.split(":")[1].replace("\n","")+"+")
        pcfl = len(largest_protein_chemical_formula)
        largest_protein_chemical_final_formula = largest_protein_chemical_formula[:pcfl-1].translate(SUB)
        print(" *", largest_protein_chemical_final_formula+"\n")
    else:
        print("\n + Total [PROTEINS FOUND!]: [ 0 ]\n")
    print(" "+"-"*5+"\n")
    if codons_found_list:
        extract_open_reading_frames(total_genomes)
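# Illustrative only (assuming DNAcodon.list entries shaped like "GCT:Alanine (A)"):
# the translation loop above reads the sequence three bases at a time, appends the
# one-letter code found between the parentheses, and breaks at the first codon whose
# name contains "Stop"; e.g. "GCTTGTTAAGGG" would yield the protein "AC".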
def extract_patterns_most_found_in_all_genomes(memory_dict):
    present_patterns = []
    for m, p in memory_dict.items():
        pattern = p[1]
        if pattern not in present_patterns:
            present_patterns.append(pattern)
    index = 0 # genome num index
    for pattern in present_patterns:
        index = index + 1
        try_pattern_against_all_genomes_by_pattern(pattern, index)
    total_patterns_all_genomes = 0
    largest_size_by_pattern = {}
    largest_size_by_pattern_index = 0
    for k,v in repeats.items():
        largest_size_by_pattern_index = largest_size_by_pattern_index + 1
        largest_size_by_pattern[largest_size_by_pattern_index] = v[0], v[2]
    total_patterns_by_pattern = 0
    list_total_patterns_by_pattern = {}
    for i, v in largest_size_by_pattern.items():
        total_patterns_by_pattern = total_patterns_by_pattern + v[1]
        list_total_patterns_by_pattern[v[0]] = total_patterns_by_pattern
    biggest_pattern_name = None
    biggest_pattern_size = 0
    smaller_pattern_name = None
    smaller_pattern_size = 0
    max_size_pattern = 0
    for r, z in list_total_patterns_by_pattern.items():
        total_patterns_all_genomes = total_patterns_all_genomes + z
        pattern_length = len(r)
        if pattern_length > max_size_pattern:
            max_size_pattern_name = r
        if biggest_pattern_name == None:
            biggest_pattern_name = r
            smaller_pattern_name = r
            biggest_pattern_size = z
            smaller_pattern_size = z
            less_size_pattern_name = r
            less_size_pattern_size = z
        else:
            if pattern_length < less_size_pattern_size:
                less_size_pattern_size = pattern_length
                less_size_pattern_name = r
            if z > biggest_pattern_size:
                biggest_pattern_name = r
                biggest_pattern_size = z
            else:
                if z < smaller_pattern_size:
                    smaller_pattern_name = r
                    smaller_pattern_size = z
    most_present_patterns_by_len_list = extract_most_present_pattern_by_len(list_total_patterns_by_pattern)
    less_present_patterns_by_len_list = extract_less_present_pattern_by_len(list_total_patterns_by_pattern)
    return max_size_pattern_name, less_size_pattern_name, biggest_pattern_name, biggest_pattern_size, smaller_pattern_name, smaller_pattern_size, total_patterns_all_genomes, most_present_patterns_by_len_list, less_present_patterns_by_len_list
def extract_most_present_pattern_by_len(list_total_patterns_by_pattern):
    most_present_patterns_by_len_list = {}
    for k, v in list_total_patterns_by_pattern.items():
        pattern_len = len(k)
        if pattern_len in most_present_patterns_by_len_list.keys():
            if v > most_present_patterns_by_len_list[pattern_len][1]:
                most_present_patterns_by_len_list[pattern_len] = k, v
        else:
            most_present_patterns_by_len_list[pattern_len] = k, v
    return most_present_patterns_by_len_list
def extract_less_present_pattern_by_len(list_total_patterns_by_pattern):
    less_present_patterns_by_len_list = {}
    for k, v in list_total_patterns_by_pattern.items():
        pattern_len = len(k)
        if pattern_len in less_present_patterns_by_len_list.keys():
            if v < less_present_patterns_by_len_list[pattern_len][1]:
                less_present_patterns_by_len_list[pattern_len] = k, v
        else:
            less_present_patterns_by_len_list[pattern_len] = k, v
    return less_present_patterns_by_len_list
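# Illustrative only: given counts {'AT': 10, 'CG': 7, 'ATT': 4}, the two helpers above
# return {2: ('AT', 10), 3: ('ATT', 4)} for "most present" and {2: ('CG', 7), 3: ('ATT', 4)}
# for "less present", i.e. one winner per pattern length.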
def extract_storage_sizes():
    total_dataset_size = 0
    total_files_size = 0
    total_list_size = 0
    for file in glob.iglob(genomes_path + '*/*/*', recursive=True): # extract datasets sizes
        if(file.endswith(".genome")):
            total_dataset_size = total_dataset_size + os.path.getsize(file) # size on disk (bytes)
    try:
        f=open(brain_path, "r") # extract brain sizes
        total_brain_size = len(f.read())
        f.close()
    except:
        total_brain_size = 0
    try:
        f=open(genomes_list_path, "r") # extract genomes list sizes
        total_list_size = len(f.read())
        f.close()
    except:
        total_list_size = 0
    if total_dataset_size > 0:
        total_files_size = int(total_files_size) + int(total_dataset_size)
        dataset_s, dataset_size_name = convert_size(total_dataset_size)
        total_dataset_size = '%s %s' % (dataset_s,dataset_size_name)
    if total_brain_size > 0:
        total_files_size = int(total_files_size) + int(total_brain_size)
        brain_s, brain_size_name = convert_size(total_brain_size)
        total_brain_size = '%s %s' % (brain_s,brain_size_name)
    if total_list_size > 0:
        total_files_size = int(total_files_size) + int(total_list_size)
        list_s, list_size_name = convert_size(total_list_size)
        total_list_size = '%s %s' % (list_s,list_size_name)
    total_s, total_size_name = convert_size(total_files_size)
    total_files_size = '%s %s' % (total_s,total_size_name)
    print(" * Total [FILE SIZES]: "+str(total_files_size)+"\n")
    if total_dataset_size:
        print(" + [DATASET]: "+str(total_dataset_size)+"\n")
    if total_list_size:
        print(" + [LIST]: "+str(total_list_size)+"\n")
    if total_brain_size:
        print(" + [BRAIN]: "+str(total_brain_size)+"\n")
def extract_total_patterns_learned_from_local(memory):
    total_patterns = 0
    for m in memory:
        total_patterns = total_patterns + 1
    print(" * [SETTINGS] Using [MAX. LENGTH] for range [PATTERN] = [ "+str(max_length)+" ]\n")
    if total_patterns > 0:
        print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]\n")
    else:
        print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]")
    generate_pattern_len_report_structure(memory)
    return memory
def list_genomes_on_database():
    print("[LIST] [REPORTING] [DNA SECUENCES] ... -> [STARTING!]\n")
    f=open(dna_codons_list_path, 'r')
    codons = f.readlines()
    f.close()
    print("-"*15 + "\n")
    f=open(open_reading_frames_init_path, 'r')
    frames_init = f.readlines()
    f.close()
    f=open(open_reading_frames_end_path, 'r')
    frames_end = f.readlines()
    f.close()
    f = open(protein_formula_path, "r")
    formulas = f.readlines()
    f.close()
    f=open(genomes_list_path, 'w')
    p = {}
    for k, v in genomes.items():
        total_protein_secuences_found = 0
        print ("="*20+"\n")
        f.write(str("="*20+"\n\n"))
        print ("* "+str(k))
        print ("\n + Total [NUCLEOTIDS FOUND!]: [ "+str(len(v)-1)+" bp linear RNA ]\n")
        print (" - [A] Adenine :", str(v.count("A")))
        print (" - [G] Guanine :", str(v.count("G")))
        print (" - [C] Cytosine :", str(v.count("C")))
        print (" - [T] Thymine :", str(v.count("T")))
        f.write(str("* "+str(k)+"\n"))
        f.write(str("\n + Total [NUCLEOTIDS FOUND!]: [ "+str(len(v)-1)+" bp linear RNA ]\n"))
        f.write(str(" - [A] Adenine : " + str(v.count("A"))+"\n"))
        f.write(str(" - [G] Guanine : " + str(v.count("G"))+"\n"))
        f.write(str(" - [C] Cytosine : " + str(v.count("C"))+"\n"))
        f.write(str(" - [T] Thymine : " + str(v.count("T"))+"\n"))
        if v.count("N") > 0:
            print (" - [N] *ANY* :", str(v.count("N")))
            f.write(str(" - [N] *ANY* : "+ str(v.count("N"))+"\n"))
        total_codons = 0
        for c in codons:
            codon_counter = v.count(str(c.split(":")[0]))
            total_codons = total_codons + codon_counter
            codon_sec = c.split(":")[0]
            codon_name = c.split(":")[1].replace("\n","")
            p[codon_sec] = codon_name
        print ("\n + Total [AMINO ACIDS FOUND!]: [ "+str(total_codons)+" ]\n")
        f.write(str("\n + Total [AMINO ACIDS FOUND!]: [ "+str(total_codons)+" ]\n"))
        for c in codons:
            codon_sec = str(c.split(":")[0])
            codon_name = str(c.split(":")[1].replace("\n",""))
            codon_counter = str(v.count(str(c.split(":")[0])))
            print (" - ["+codon_sec+"] "+codon_name+" :", codon_counter)
            f.write(str(" - ["+codon_sec+"] "+codon_name+" : "+ codon_counter)+"\n")
        ps = ""
        dna = str(v)
        for i in range(0, len(dna)-(3+len(dna)%3), 3): # searching protein secuence
            if "Stop" in p[dna[i:i+3]]:
                break
            ps += p[dna[i:i+3]].split("(")[1].split(")")[0]
        total_protein_secuences_found = total_protein_secuences_found + 1
        print ("\n + Total [PROTEINS FOUND!]: [ "+str(total_protein_secuences_found)+" ]\n")
        f.write(str("\n + Total [PROTEINS FOUND!]: [ "+str(total_protein_secuences_found)+" ]\n"))
        protein_chemical_formula = ""
        for a in ps:
            for formula in formulas:
                if a == formula.split(":")[0]:
                    protein_chemical_formula += str(formula.split(":")[1].replace("\n","")+"+")
        pcfl = len(protein_chemical_formula)
        protein_chemical_final_formula = protein_chemical_formula[:pcfl-1].translate(SUB)
        print (" - ["+ps+"] : "+protein_chemical_final_formula)
        f.write(str(" - ["+ps+"] : "+protein_chemical_final_formula)+"\n")
        ps = "" # clean protein secuence
        if frames_init and frames_end:
            total_opr_found = 0
            r_found_by_pattern = 0
            opr_found_list = {}
            index = 0
            for opr_i in frames_init:
                opr_init_name = opr_i.replace("\n","")
                if str(opr_init_name) in str(v.upper()): # open reading INIT frame found!
                    for opr_e in frames_end:
                        opr_end_name = opr_e.replace("\n","")
                        if str(opr_end_name) in str(v.upper()): # open reading END frame found!
                            regex_opr = str(opr_init_name) +"(.+?)"+str(opr_end_name) # regex magics! - extract secuence between ocr_i and ocr_e
                            pattern_record = re.compile(regex_opr)
                            record = re.findall(pattern_record, str(v.upper()))
                            for r in record: # now extract each field
                                total_opr_found = total_opr_found + 1
                                r_found_by_pattern = v.count(opr_init_name+r+opr_end_name)
                                index = index + 1
                                opr_found_list[index] = k, r_found_by_pattern, opr_init_name, r, opr_end_name # [index]: genome, num_times, opr_i, pattern, opr_e
            print ("\n + Total [OPEN READING FRAMES FOUND!]: [ "+str(total_opr_found)+" ]")
            f.write(str("\n + Total [OPEN READING FRAMES FOUND!]: [ "+str(total_opr_found)+" ] \n"))
            for m, n in opr_found_list.items():
                #print(" - ["+str(n[2])+str(n[3])+str(n[4])+"] : [ "+str(n[1])+" ] time(s)")
                f.write(str(" - ["+str(n[2])+str(n[3])+str(n[4])+"] : "+ str(n[1]))+"\n")
        print ("")
        f.write("\n")
    print("-"*15 + "\n")
    print ("[LIST] [INFO] [SAVED!] at: '"+str(genomes_list_path)+"'... -> [EXITING!]\n")
    f.close()
def examine_stored_brain_memory():
    memory = [] # list used as hot-memory
    f=open(brain_path, 'r')
    for line in f.readlines():
        if line not in memory:
            memory.append(line)
    f.close()
    if not memory: # first time run!
        print ("[LIBRE-AI] [INFO] No [BRAIN] present ... -> [BUILDING ONE!]\n")
        print("-"*15 + "\n")
        for i in range(2, 11+1):
            seed = [random.randrange(0, 4) for _ in range(i)] # generate "static" genesis seed
            if seed not in seeds_checked:
                seeds_checked.append(seed)
                pattern = ""
                for n in seed:
                    if n == 0:
                        pattern += "A"
                    elif n == 1:
                        pattern += "C"
                    elif n == 2:
                        pattern += "T"
                    else:
                        pattern += "G"
                print("[LIBRE-AI] [SEARCH] Generating [RANDOM] pattern: " + str(pattern) + "\n")
                create_new_pattern(pattern) # create new pattern
        print("-"*15 + "\n")
        print ("[LIBRE-AI] [INFO] A new [BRAIN] has been created !!! ... -> [ADVANCING!]\n")
        f=open(brain_path, 'r')
        memory = f.read().replace('\n',' ')
        f.close()
    return memory
def generate_pattern_len_report_structure(memory):
    pattern_len_counts = {} # pattern length -> occurrences, related with [MAX. LENGTH] range (1..max_length)
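    # Illustrative only (the genome file name is hypothetical): for a stored record like
    # "{'example.genome': (3, 'ATTACG')}", the split()-based extraction below isolates
    # the pattern "ATTACG" and counts it under length 6.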
    for m in memory:
        try:
            pattern_len = m.split(", '")[1]
            pattern_len = pattern_len.split("')")[0]
            pattern_len = len(pattern_len)
        except:
            pattern_len = 0 # discard!
        if 1 <= pattern_len <= max_length: # lengths outside the range are discarded
            pattern_len_counts[pattern_len] = pattern_len_counts.get(pattern_len, 0) + 1
    for pattern_len in sorted(pattern_len_counts): # report only lengths actually seen
        spacing = " " if pattern_len < 10 else "" # keep single-digit lengths aligned
        print(" - [length = "+str(pattern_len)+"]"+spacing+": [ "+str(pattern_len_counts[pattern_len])+" ]")
def print_banner():
    print("\n"+"="*50)
    print(" ____  _       _   _    _    ")
    print("|  _ \(_) __ _| \ | |  / \   ")
    print("| | | | |/ _` |  \| | / _ \  ")
    print("| |_| | | (_| | |\  |/ ___ \ ")
    print("|____/|_|\__,_|_| \_/_/   \_\ by psy")
    print('\n"Search and Recognize patterns in DNA sequences"')
    print("\n"+"="*50)
    print("+ GENOMES DETECTED:", str(num_files))
    print("="*50)
    print("\n"+"-"*15+"\n")
    print(" * VERSION: ")
    print(" + "+VERSION+" - (rev:"+RELEASE+")")
    print("\n * SOURCES:")
    print(" + "+SOURCE1)
    print(" + "+SOURCE2)
    print("\n * CONTACT: ")
    print(" + "+CONTACT+"\n")
    print("-"*15+"\n")
    print("="*50)
# sub_init #
num_files=0
for file in glob.iglob(genomes_path + '**/*', recursive=True):
    if(file.endswith(".genome")):
        num_files = num_files + 1
        f=open(file, 'r')
        genome = f.read().replace('\n',' ')
        genomes[file.replace("datasets/","")] = genome.upper() # add genome to main dict
        f.close()
print_banner() # show banner
option = input("\n+ CHOOSE: (S)earch, (L)ist, (T)rain or (R)eport: ").upper()
print("")
print("="*50+"\n")
if option == "S": # search pattern
    search_pattern_with_human()
elif option == "L": # list genomes
    list_genomes_on_database()
elif option == "T": # teach AI
    teach_ai()
else: # libre AI
    libre_ai()
print ("="*50+"\n")