diana.py 30 KB


  1. #!/usr/bin/env python
# -*- coding: utf-8 -*-
  3. """
  4. DiaNA - 2020 - by psy (epsylon@riseup.net)
  5. You should have received a copy of the GNU General Public License along
  6. with DiaNA; if not, write to the Free Software Foundation, Inc., 51
  7. Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  8. """
  9. VERSION = "v0.1_beta"
  10. RELEASE = "16032020"
  11. SOURCE1 = "https://code.03c8.net/epsylon/diana"
  12. SOURCE2 = "https://github.com/epsylon/diana"
  13. CONTACT = "epsylon@riseup.net - (https://03c8.net)"
  14. """
  15. DNA-equiv:
  16. A <-> T
  17. C <-> G
  18. """
  19. import re, os, glob, random, time, math
  20. brain_path = "datasets/brain.in" # in/out brain-tmp file
  21. genomes_path = 'datasets/' # genome datasets raw data
  22. genomes_list_path = "datasets/genome.list" # genome list
  23. dna_letters = ["A", "T", "G", "C", "N"] # dna alphabet [n for ANY nucl.]
  24. genomes = {} # main sources dict: genome_name
  25. seeds_checked = [] # list used for random checked patterns
  26. repeats = {} # repetitions 'tmp' dict: genome_name:(repets,pattern)
  27. known_patterns = [] # list used for known patterns
  28. estimated_max_range_for_library_completed = 20 # [MAX. LENGTH] for range [PATTERN]
  29. estimated_patterns_for_library_completed = 1466015503700 # x = y+4^z
  30. estimated_quantity_per_pattern_for_library_completed = int(estimated_patterns_for_library_completed / estimated_max_range_for_library_completed)
  31. def convert_size(size):
  32. if (size == 0):
  33. return '0 B'
  34. size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
  35. i = int(math.floor(math.log(size,1024)))
  36. p = math.pow(1024,i)
  37. s = round(size/p,2)
  38. return s, size_name[i]
  39. def search_pattern_with_human():
  40. pattern = input("[HUMAN] [SEARCH] Pattern (ex: attacg): ").upper()
  41. print("\n"+"-"*5 + "\n")
  42. create_new_pattern(pattern) # create new pattern
  43. def try_pattern_against_all_genomes(pattern):
  44. patterns_found = 0
  45. for k, v in genomes.items():
  46. if pattern in v:
  47. t = len(re.findall(pattern, v))
  48. print (" *", k +":", "-> [",t,"times ]")
  49. repeats[k] = t, pattern
  50. patterns_found = patterns_found + 1
  51. print("")
  52. if patterns_found == 0:
  53. print("[INFO] -> Not any found! ... [EXITING!]\n")
  54. def sanitize_dna_pattern(pattern):
  55. valid_pattern = True
  56. for c in pattern:
  57. if c == "A":
  58. pass
  59. elif c == "T":
  60. pass
  61. elif c == "G":
  62. pass
  63. elif c == "C":
  64. pass
  65. elif c == "N":
  66. pass
  67. else:
  68. valid_pattern = False
  69. return valid_pattern
  70. def teach_ai():
  71. mode = input("[TRAIN-AI] MODE -> (H)uman, (A)utomata: ").upper()
  72. if not os.path.isfile(brain_path):
  73. create_initial_seed_file()
  74. if mode == "H": # human mode
  75. teach_ai_human_mode()
  76. else: # libre AI
  77. teach_ai_automata_mode() # automata mode
  78. def teach_ai_human_mode(): # search/discard patterns with human interaction & generate local database
  79. search_patterns_lesson_with_a_human()
  80. def search_patterns_lesson_with_a_human():
  81. print("\n"+"-"*30)
  82. print("\n[TRAIN-AI] [HUMAN] [STOP] this mode; just entering whatever invalid pattern (ex: 'exit' or 'q').\n")
  83. key = "K" # continue
  84. while key == "K":
  85. pattern = input("[TRAIN-AI] [HUMAN] [LOOP] [SEARCH] Pattern (ex: attacg): ").upper()
  86. print("\n"+"-"*5 + "\n")
  87. key = search_pattern_on_lesson(pattern)
  88. if key == "Z": # stop
  89. break
  90. def search_pattern_on_lesson(pattern):
  91. valid_pattern = sanitize_dna_pattern(pattern)
  92. if valid_pattern == True:
  93. key = search_pattern_on_local_database(pattern) # search pattern on local database
  94. else:
  95. print("[ERROR] -> Invalid DNA pattern ... [EXITING!]\n")
  96. key = "Z" # stop
  97. return key
  98. def search_pattern_on_local_database(pattern):
  99. f=open(brain_path, 'r')
  100. memory = f.read().replace('\n',' ')
  101. f.close()
  102. patterns_known = 0
  103. if not "'"+pattern+"'" in memory: # always create new patterns
  104. create_new_pattern(pattern) # create new pattern
  105. patterns_known = patterns_known + 1
  106. else:
  107. for k, v in genomes.items(): # create patterns found for new genomes
  108. if k not in memory:
  109. create_new_pattern(pattern) # create new pattern
  110. patterns_known = patterns_known + 1
  111. if patterns_known == 0:
  112. print("[TRAIN-AI] [AUTOMATA] [LOOP] [RESULTS] -ALREADY- [LEARNED!] ... -> [GOING FOR NEXT!]\n")
  113. print("-"*5 + "\n")
  114. key = "K" # continue
  115. return key
  116. def create_initial_seed_file():
  117. f=open(brain_path, 'w')
  118. f.write("")
  119. f.close()
  120. def create_new_pattern(pattern): # append it to brain
  121. valid_pattern = sanitize_dna_pattern(pattern)
  122. if valid_pattern == True:
  123. if pattern not in known_patterns:
  124. known_patterns.append(pattern)
  125. try_pattern_against_all_genomes(pattern)
  126. f=open(brain_path, 'a')
  127. f.write(str(repeats)+os.linesep) # add dict as str
  128. f.close()
  129. def teach_ai_automata_mode(): # search patterns by bruteforcing ranges & generate local database
  130. search_patterns_lesson_with_an_ai()
  131. def search_patterns_lesson_with_an_ai():
  132. print("\n"+"-"*30)
  133. print("\n[TRAIN-AI] [AUTOMATA] [STOP] this mode; pressing 'CTRL+z'.\n")
  134. ranges = input("[TRAIN-AI] [AUTOMATA] [SEARCH] Set range (x<y) for pattern deep searching (ex: 2-8): ")
  135. print ("")
  136. valid_range, ranged_permutations = check_for_deep_searching_ranges(ranges)
  137. if str(valid_range) == "OK!":
  138. ranged_ending = False
  139. print("-"*15)
  140. print("\n[TRAIN-AI] [AUTOMATA] [SEARCH] Number of [PERMUTATIONS] estimated: [ "+str(ranged_permutations)+" ]\n")
  141. print("-"*15+"\n")
  142. num_pat = 0
  143. time.sleep(10)
  144. while ranged_ending == False: # try to STOP it using: CTRL-z
  145. try:
  146. pattern, ranged_ending = generate_random_pattern(ranges, ranged_permutations) # generate random seed
  147. if pattern:
  148. num_pat = num_pat + 1
  149. print("[TRAIN-AI] [AUTOMATA] [LOOP] [SEARCH] Generating [RANDOM!] ["+str(num_pat)+"/"+str(ranged_permutations)+"] pattern: [ " + str(pattern) + " ]\n")
  150. if not num_pat == ranged_permutations:
  151. search_pattern_on_lesson(pattern)
  152. else:
  153. search_pattern_on_lesson(pattern)
  154. print("[TRAIN-AI] [AUTOMATA] [RESULTS]: REVIEWED -> [ "+str(ranged_permutations)+" PERMUTATIONS ] ... -> [EXITING!]\n")
  155. ranged_ending = True
  156. except:
  157. pass
  158. else:
  159. print("-"*15+"\n")
  160. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] -> "+valid_range+" ... [EXITING!]\n")
  161. def generate_random_pattern(ranges, ranged_permutations):
  162. ranged_length = 0
  163. try:
  164. range_low = int(ranges.split("-")[0])
  165. range_high = int(ranges.split("-")[1])
  166. for i in range(range_low, range_high+1):
  167. ranged_length = ranged_length + 1
  168. if ranged_length == ranged_permutations: # all possible variables have been bruteforced/checked! -> exit
  169. pattern = None
  170. ranged_ending = True
  171. return pattern, ranged_ending
  172. else:
  173. ranged_ending = False
  174. seed = [random.randrange(0, 4) for _ in range(i)] # generate "random" seed
  175. if seed not in seeds_checked:
  176. seeds_checked.append(seed)
  177. pattern = ""
  178. for n in seed:
  179. if n == 0:
  180. pattern += "A"
  181. elif n == 1:
  182. pattern += "C"
  183. elif n == 2:
  184. pattern += "T"
  185. else:
  186. pattern += "G"
  187. return pattern, ranged_ending
  188. except:
  189. print("[TRAIN-AI] [AUTOMATA] [ERROR] -> [INVALID!] Deep Learning [RANGE] ... [EXITING!]\n")
  190. pattern = None
  191. ranged_ending = True
  192. return pattern, ranged_ending
  193. def check_for_deep_searching_ranges(ranges):
  194. try:
  195. range_low = ranges.split("-")[0]
  196. range_high = ranges.split("-")[1]
  197. except:
  198. valid_range = "'bad format'"
  199. try:
  200. range_low = int(range_low)
  201. except:
  202. valid_range = "'low range' should be an integer"
  203. try:
  204. range_high = int(range_high)
  205. except:
  206. valid_range = "'high range' should be an integer"
  207. try:
  208. if range_low < range_high:
  209. if range_low > 1: # always range > 1
  210. valid_range = "OK!"
  211. else:
  212. valid_range = "'low range' should be > than 1"
  213. else:
  214. valid_range = "'low range' should be < than 'high range'"
  215. except:
  216. valid_range = "'bad format'"
  217. try:
  218. ranged_permutations = math_ranged_permutations(range_low, range_high)
  219. except:
  220. ranged_permutations = 0
  221. valid_range = "'bad format'"
  222. return valid_range, ranged_permutations
  223. def math_ranged_permutations(range_low, range_high): # calculate ranged_permutations
  224. ranged_permutations = 0
  225. for i in range(range_low, range_high+1):
  226. ranged_permutations = ranged_permutations + (4**i)
  227. return ranged_permutations
  228. def libre_ai(): # show statistics / download new genomes / keep crossing new genomes with local database / search for new patterns (non stop!)
  229. if not os.path.isfile(brain_path):
  230. create_initial_seed_file()
  231. memory = examine_stored_brain_memory()
  232. if memory != "":
  233. #print("[LIBRE-AI] [STOP] this mode; pressing 'CTRL+z'.\n")
  234. libre_ai_show_statistics(memory) # show statistics
  235. def libre_ai_show_statistics(memory):
  236. print("[LIBRE-AI] [REPORTING] [STATISTICS] ... -> [STARTING!]\n")
  237. print("-"*15 + "\n")
  238. total_genomes = 0
  239. total_adenine = 0
  240. total_guanine = 0
  241. total_cytosine = 0
  242. total_thymine = 0
  243. total_any = 0
  244. secuence_length = 0
  245. secuences_list = {}
  246. largest = None
  247. largest_len = 0
  248. shortest_len = 0
  249. average = None
  250. shortest = None
  251. for k, v in genomes.items():
  252. secuence_length = len(v)
  253. secuences_list[k] = str(secuence_length)
  254. total_genomes = total_genomes + 1
  255. total_adenine = total_adenine + v.count("A")
  256. total_guanine = total_guanine + v.count("G")
  257. total_cytosine = total_cytosine + v.count("C")
  258. total_thymine = total_thymine + v.count("T")
  259. total_any = total_any + v.count("N")
  260. largest = max(secuences_list, key=secuences_list.get)
  261. shortest = min(secuences_list, key=secuences_list.get)
  262. for k, v in genomes.items():
  263. if k == largest:
  264. largest_len = len(v)
  265. elif k == shortest:
  266. shortest_len = len(v)
  267. else:
  268. pass
  269. path = genomes_path # genome datasets raw data
  270. l = glob.glob(genomes_path+"*") # black magic!
  271. latest_collection_file = max(l, key=os.path.getctime)
  272. latest_collection_date = time.ctime(os.path.getmtime(latest_collection_file))
  273. total_nucleotids = [total_adenine, total_guanine, total_cytosine, total_thymine, total_any]
  274. num_total_nucleotids = total_adenine + total_guanine + total_cytosine + total_thymine + total_any
  275. nucleotid_more_present = max(total_nucleotids)
  276. print("[LIBRE-AI] [REPORTING] -STORAGE- [STATISTICS]: \n")
  277. extract_storage_sizes()
  278. print(" * [LATEST UPDATE]: '"+str(latest_collection_date)+"'\n")
  279. print(" + File: '"+str(latest_collection_file)+"'\n")
  280. print("-"*5 + "\n")
  281. print("[LIBRE-AI] [REPORTING] -COLLECTION- [STATISTICS]: \n")
  282. extract_total_patterns_learned_from_local(memory)
  283. print("\n"+"-"*5 + "\n")
  284. print("[LIBRE-AI] [REPORTING] -ANALYSIS- [STATISTICS]: \n")
  285. print(" * Total [DNA SECUENCES]: [ "+str(total_genomes)+" ]\n")
  286. print(" + [LARGEST]: "+str(largest)+ " [ "+str(largest_len)+" bp linear RNA ]")
  287. print(" + [SHORTEST]: "+str(shortest)+ " [ "+str(shortest_len)+" bp linear RNA ]\n")
  288. print(" * Total [NUCLEOTIDS]: [ "+str(num_total_nucleotids)+" ]\n")
  289. if nucleotid_more_present == total_adenine:
  290. print(" + [A] Adenine : "+str(total_adenine)+" <- [MAX]")
  291. else:
  292. print(" + [A] Adenine : "+str(total_adenine))
  293. if nucleotid_more_present == total_guanine:
  294. print(" + [G] Guanine : "+str(total_guanine)+" <- [MAX]")
  295. else:
  296. print(" + [G] Guanine : "+str(total_guanine))
  297. if nucleotid_more_present == total_cytosine:
  298. print(" + [C] Cytosine : "+str(total_cytosine)+" <- [MAX]")
  299. else:
  300. print(" + [C] Cytosine : "+str(total_cytosine))
  301. if nucleotid_more_present == total_thymine:
  302. print(" + [T] Thymine : "+str(total_thymine)+" <- [MAX]")
  303. else:
  304. print(" + [T] Thymine : "+str(total_thymine))
  305. if total_any > 0:
  306. if nucleotid_more_present == total_any:
  307. print(" + [N] *ANY* : "+str(total_any)+" <- [MAX]\n")
  308. else:
  309. print(" + [N] *ANY* : "+str(total_any)+"\n")
  310. print("-"*5 + "\n")
  311. extract_pattern_most_present_local(memory)
  312. def convert_memory_to_dict(memory): # [index] = genome_name, pattern, num_rep
  313. memory_dict = {}
  314. index = 0
  315. for m in memory:
  316. regex_record = "'(.+?)': (.+?), '(.+?)'" # regex magics! - extract first each record
  317. pattern_record = re.compile(regex_record)
  318. record = re.findall(pattern_record, m)
  319. for r in record: # now extract each field
  320. index = index + 1
  321. name = str(r).split("', '(")[0]
  322. genome_name = str(name).split("'")[1]
  323. repeats = str(r).split("', '(")[1]
  324. genome_repeats = str(repeats).split("',")[0]
  325. pattern = str(repeats).split("',")[1]
  326. genome_pattern = pattern.replace(" ", "")
  327. genome_pattern = genome_pattern.replace("'", "")
  328. genome_pattern = genome_pattern.replace(")", "")
  329. memory_dict[index] = genome_name, genome_pattern, genome_repeats # generate memory_dict!
  330. return memory_dict
  331. def extract_pattern_most_present_local(memory):
  332. memory_dict = convert_memory_to_dict(memory)
  333. if memory_dict:
  334. print("[LIBRE-AI] [REPORTING] -RESEARCHING- [STATISTICS]: \n")
  335. total_patterns_found = 0
  336. total_genomes = 0
  337. for k, v in memory_dict.items():
  338. total_patterns_found = total_patterns_found + 1
  339. for k, v in genomes.items():
  340. total_genomes = total_genomes + 1
  341. print(" * [ "+str(total_patterns_found)+" ] [PATTERNS FOUND!] in: [ "+str(total_genomes)+ " ] [DNA SECUENCES]\n")
  342. def extract_storage_sizes():
  343. total_dataset_size = 0
  344. total_files_size = 0
  345. total_list_size = 0
  346. for file in glob.iglob(genomes_path + '**/*', recursive=True):
  347. if(file.endswith(".genome")):
  348. total_dataset_size = total_dataset_size + len(file)
  349. elif(file.endswith(".in")):
  350. total_brain_size = len(file)
  351. elif(file.endswith(".list")):
  352. total_list_size = len(file)
  353. if total_dataset_size > 0:
  354. total_files_size = int(total_files_size) + int(total_dataset_size)
  355. dataset_s, dataset_size_name = convert_size(total_dataset_size)
  356. total_dataset_size = '%s %s' % (dataset_s,dataset_size_name)
  357. if total_brain_size > 0:
  358. total_files_size = int(total_files_size) + int(total_brain_size)
  359. brain_s, brain_size_name = convert_size(total_brain_size)
  360. total_brain_size = '%s %s' % (brain_s,brain_size_name)
  361. if total_list_size > 0:
  362. total_files_size = int(total_files_size) + int(total_list_size)
  363. list_s, list_size_name = convert_size(total_list_size)
  364. total_list_size = '%s %s' % (list_s,list_size_name)
  365. total_s, total_size_name = convert_size(total_files_size)
  366. total_files_size = '%s %s' % (total_s,total_size_name)
  367. print(" * Total [FILE SIZES]: "+str(total_files_size)+"\n")
  368. if total_dataset_size:
  369. print(" + [DATASET]: "+str(total_dataset_size))
  370. if total_list_size:
  371. print(" + [LIST]: "+str(total_list_size))
  372. if total_brain_size:
  373. print(" + [BRAIN]: "+str(total_brain_size)+"\n")
  374. def extract_total_patterns_learned_from_local(memory):
  375. total_patterns = 0
  376. for m in memory:
  377. total_patterns = total_patterns + 1
  378. print(" * [SETTINGS] Using [MAX. LENGTH] for range [PATTERN] = "+str(estimated_max_range_for_library_completed)+"\n")
  379. if total_patterns < estimated_patterns_for_library_completed:
  380. library_completion = (total_patterns/estimated_patterns_for_library_completed)*100
  381. print(" + [LIBRARY COMPLETED]: [ "+str('%.20f' % library_completion)+"% ]")
  382. if total_patterns > 0:
  383. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" / "+str(estimated_patterns_for_library_completed)+" ] \n")
  384. else:
  385. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" / "+str(estimated_patterns_for_library_completed)+" ]")
  386. else:
  387. total_current_library_completion = (total_patterns/estimated_patterns_for_library_completed)*100
  388. library_completion = 100
  389. print(" + [LIBRARY COMPLETED]: [ "+str(library_completion)+"% ]")
  390. print(" + [CURRENT LIBRARY] : [ "+str('%.00f' % total_current_library_completion)+"% ] -> [ATTENTION!]: INCREASED [MAX. LENGTH] for range [PATTERN] -> REQUIRED!")
  391. if total_patterns > 0:
  392. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]\n")
  393. else:
  394. print(" + [PATTERNS LEARNED!]: [ "+str(total_patterns)+" ]")
  395. pattern_len_1 = 0
  396. pattern_len_2 = 0
  397. pattern_len_3 = 0
  398. pattern_len_4 = 0
  399. pattern_len_5 = 0
  400. pattern_len_6 = 0
  401. pattern_len_7 = 0
  402. pattern_len_8 = 0
  403. pattern_len_9 = 0
  404. pattern_len_10 = 0
  405. pattern_len_11 = 0
  406. pattern_len_12 = 0
  407. pattern_len_13 = 0
  408. pattern_len_14 = 0
  409. pattern_len_15 = 0
  410. pattern_len_16 = 0
  411. pattern_len_17 = 0
  412. pattern_len_18 = 0
  413. pattern_len_19 = 0
  414. pattern_len_20 = 0
  415. for m in memory:
  416. pattern_len = m.split(", '")[1]
  417. pattern_len = pattern_len.split("')")[0]
  418. pattern_len = len(pattern_len)
  419. if pattern_len == 1:
  420. pattern_len_1 = pattern_len_1 + 1
  421. elif pattern_len == 2:
  422. pattern_len_2 = pattern_len_2 + 1
  423. elif pattern_len == 3:
  424. pattern_len_3 = pattern_len_3 + 1
  425. elif pattern_len == 4:
  426. pattern_len_4 = pattern_len_4 + 1
  427. elif pattern_len == 5:
  428. pattern_len_5 = pattern_len_5 + 1
  429. elif pattern_len == 6:
  430. pattern_len_6 = pattern_len_6 + 1
  431. elif pattern_len == 7:
  432. pattern_len_7 = pattern_len_7 + 1
  433. elif pattern_len == 8:
  434. pattern_len_8 = pattern_len_8 + 1
  435. elif pattern_len == 9:
  436. pattern_len_9 = pattern_len_9 + 1
  437. elif pattern_len == 10:
  438. pattern_len_10 = pattern_len_10 + 1
  439. elif pattern_len == 11:
  440. pattern_len_11 = pattern_len_11 + 1
  441. elif pattern_len == 12:
  442. pattern_len_12 = pattern_len_12 + 1
  443. elif pattern_len == 13:
  444. pattern_len_13 = pattern_len_13 + 1
  445. elif pattern_len == 14:
  446. pattern_len_14 = pattern_len_14 + 1
  447. elif pattern_len == 15:
  448. pattern_len_15 = pattern_len_15 + 1
  449. elif pattern_len == 16:
  450. pattern_len_16 = pattern_len_16 + 1
  451. elif pattern_len == 17:
  452. pattern_len_17 = pattern_len_17 + 1
  453. elif pattern_len == 18:
  454. pattern_len_18 = pattern_len_18 + 1
  455. elif pattern_len == 19:
  456. pattern_len_19 = pattern_len_19 + 1
  457. else:
  458. pattern_len_20 = pattern_len_20 + 1
  459. if pattern_len_1 < 101:
  460. progression_len_1 = pattern_len_1 * "*"
  461. else:
  462. progression_len_1 = 100 * "*+"+str(pattern_len_1-100)
  463. if pattern_len_2 < 101:
  464. progression_len_2 = pattern_len_2 * "*"
  465. else:
  466. progression_len_2 = 100 * "*+"+str(pattern_len_2-100)
  467. if pattern_len_3 < 101:
  468. progression_len_3 = pattern_len_3 * "*"
  469. else:
  470. progression_len_3 = 100 * "*+"+str(pattern_len_3-100)
  471. if pattern_len_4 < 101:
  472. progression_len_4 = pattern_len_4 * "*"
  473. else:
  474. progression_len_4 = 100 * "*"+" 100+"+str(pattern_len_4-100)
  475. if pattern_len_5 < 101:
  476. progression_len_5 = pattern_len_5 * "*"
  477. else:
  478. progression_len_5 = 100 * "*+"+str(pattern_len_5-100)
  479. if pattern_len_6 < 101:
  480. progression_len_6 = pattern_len_6 * "*"
  481. else:
  482. progression_len_6 = 100 * "*+"+str(pattern_len_6-100)
  483. if pattern_len_7 < 101:
  484. progression_len_7 = pattern_len_7 * "*"
  485. else:
  486. progression_len_7 = 100 * "*+"+str(pattern_len_7-100)
  487. if pattern_len_8 < 101:
  488. progression_len_8 = pattern_len_8 * "*"
  489. else:
  490. progression_len_8 = 100 * "*+"+str(pattern_len_8-100)
  491. if pattern_len_9 < 101:
  492. progression_len_9 = pattern_len_9 * "*"
  493. else:
  494. progression_len_9 = 100 * "*+"+str(pattern_len_9-100)
  495. if pattern_len_10 < 101:
  496. progression_len_10 = pattern_len_10 * "*"
  497. else:
  498. progression_len_10 = 100 * "*+"+str(pattern_len_10-100)
  499. if pattern_len_11 < 101:
  500. progression_len_11 = pattern_len_11 * "*"
  501. else:
  502. progression_len_11 = 100 * "*+"+str(pattern_len_11-100)
  503. if pattern_len_12 < 101:
  504. progression_len_12 = pattern_len_12 * "*"
  505. else:
  506. progression_len_12 = 100 * "*+"+str(pattern_len_12-100)
  507. if pattern_len_13 < 101:
  508. progression_len_13 = pattern_len_13 * "*"
  509. else:
  510. progression_len_13 = 100 * "*+"+str(pattern_len_13-100)
  511. if pattern_len_14 < 101:
  512. progression_len_14 = pattern_len_14 * "*"
  513. else:
  514. progression_len_14 = 100 * "*+"+str(pattern_len_14-100)
  515. if pattern_len_15 < 101:
  516. progression_len_15 = pattern_len_15 * "*"
  517. else:
  518. progression_len_15 = 100 * "*+"+str(pattern_len_15-100)
  519. if pattern_len_16 < 101:
  520. progression_len_16 = pattern_len_16 * "*"
  521. else:
  522. progression_len_16 = 100 * "*+"+str(pattern_len_16-100)
  523. if pattern_len_17 < 101:
  524. progression_len_17 = pattern_len_17 * "*"
  525. else:
  526. progression_len_17 = 100 * "*+"+str(pattern_len_17-100)
  527. if pattern_len_18 < 101:
  528. progression_len_18 = pattern_len_18 * "*"
  529. else:
  530. progression_len_18 = 100 * "*+"+str(pattern_len_18-100)
  531. if pattern_len_19 < 101:
  532. progression_len_19 = pattern_len_19 * "*"
  533. else:
  534. progression_len_19 = 100 * "*+"+str(pattern_len_19-100)
  535. if pattern_len_20 < 101:
  536. progression_len_20 = pattern_len_20 * "*"
  537. else:
  538. progression_len_20 = 100 * "*+"+str(pattern_len_20-100)
  539. if pattern_len_1 > 0:
  540. print(" - [length = 1] | "+progression_len_1 + " [ "+str(pattern_len_1)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  541. if pattern_len_2 > 0:
  542. print(" - [length = 2] | "+progression_len_2 + " [ "+str(pattern_len_2)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  543. if pattern_len_3 > 0:
  544. print(" - [length = 3] | "+progression_len_3 + " [ "+str(pattern_len_3)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  545. if pattern_len_4 > 0:
  546. print(" - [length = 4] | "+progression_len_4 + " [ "+str(pattern_len_4)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  547. if pattern_len_5 > 0:
  548. print(" - [length = 5] | "+progression_len_5 + " [ "+str(pattern_len_5)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  549. if pattern_len_6 > 0:
  550. print(" - [length = 6] | "+progression_len_6 + " [ "+str(pattern_len_6)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  551. if pattern_len_7 > 0:
  552. print(" - [length = 7] | "+progression_len_7 + " [ "+str(pattern_len_7)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  553. if pattern_len_8 > 0:
  554. print(" - [length = 8] | "+progression_len_8 + " [ "+str(pattern_len_8)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  555. if pattern_len_9 > 0:
  556. print(" - [length = 9] | "+progression_len_9 + " [ "+str(pattern_len_9)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  557. if pattern_len_10 > 0:
  558. print(" - [length = 10] | "+progression_len_10 + " [ "+str(pattern_len_10)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  559. if pattern_len_11 > 0:
  560. print(" - [length = 11] | "+progression_len_11 + " [ "+str(pattern_len_11)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  561. if pattern_len_12 > 0:
  562. print(" - [length = 12] | "+progression_len_12 + " [ "+str(pattern_len_12)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  563. if pattern_len_13 > 0:
  564. print(" - [length = 13] | "+progression_len_13 + " [ "+str(pattern_len_13)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  565. if pattern_len_14 > 0:
  566. print(" - [length = 14] | "+progression_len_14 + " [ "+str(pattern_len_14)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  567. if pattern_len_15 > 0:
  568. print(" - [length = 15] | "+progression_len_15 + " [ "+str(pattern_len_15)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  569. if pattern_len_16 > 0:
  570. print(" - [length = 16] | "+progression_len_16 + " [ "+str(pattern_len_16)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  571. if pattern_len_17 > 0:
  572. print(" - [length = 17] | "+progression_len_17 + " [ "+str(pattern_len_17)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  573. if pattern_len_18 > 0:
  574. print(" - [length = 18] | "+progression_len_18 + " [ "+str(pattern_len_18)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  575. if pattern_len_19 > 0:
  576. print(" - [length = 19] | "+progression_len_19 + " [ "+str(pattern_len_19)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  577. if pattern_len_20 > 0:
  578. print(" - [length => 20] | "+progression_len_20 + " [ "+str(pattern_len_20)+" / "+str(estimated_quantity_per_pattern_for_library_completed)+" ]")
  579. return memory
  580. def list_genomes_on_database():
  581. print("[LIST] [REPORTING] [DNA SECUENCES] ... -> [STARTING!]\n")
  582. print("-"*15 + "\n")
  583. f=open(genomes_list_path, 'w')
  584. for k, v in genomes.items():
  585. print ("*"+str(k)+ "-> [ "+str(len(v))+" bp linear RNA ]")
  586. print (" + [A] Adenine :", str(v.count("A")))
  587. print (" + [G] Guanine :", str(v.count("G")))
  588. print (" + [C] Cytosine :", str(v.count("C")))
  589. print (" + [T] Thymine :", str(v.count("T")))
  590. f.write(str("*"+ str(k)+ " -> [ "+str(len(v))+"bp linear RNA ]\n"))
  591. f.write(str(" + [A] Adenine : " + str(v.count("A"))+"\n"))
  592. f.write(str(" + [G] Guanine : " + str(v.count("G"))+"\n"))
  593. f.write(str(" + [C] Cytosine : " + str(v.count("C"))+"\n"))
  594. f.write(str(" + [T] Thymine : " + str(v.count("T"))+"\n"))
  595. if v.count("N") > 0:
  596. print (" + [N] *ANY* :", str(v.count("N")))
  597. f.write(str(" + [N] *ANY* : "+ str(v.count("N"))+"\n"))
  598. print ("")
  599. f.write("\n")
  600. print("-"*15 + "\n")
  601. print ("[LIST] [INFO] [SAVED!] at: '"+str(genomes_list_path)+"'... -> [EXITING!]\n")
  602. f.close()
  603. def examine_stored_brain_memory():
  604. memory = [] # list used as hot-memory
  605. f=open(brain_path, 'r')
  606. for line in f.readlines():
  607. if line not in memory:
  608. memory.append(line)
  609. f.close()
  610. if memory == "": # first time run!
  611. print ("[LIBRE-AI] [INFO] Not any [BRAIN] present ... -> [BUILDING ONE!]\n")
  612. print("-"*15 + "\n")
  613. for i in range(2, 11+1):
  614. seed = [random.randrange(0, 4) for _ in range(i)] # generate "static" genesis seed
  615. if seed not in seeds_checked:
  616. seeds_checked.append(seed)
  617. pattern = ""
  618. for n in seed:
  619. if n == 0:
  620. pattern += "A"
  621. elif n == 1:
  622. pattern += "C"
  623. elif n == 2:
  624. pattern += "T"
  625. else:
  626. pattern += "G"
  627. print("[LIBRE-AI] [SEARCH] Generating [RANDOM] pattern: " + str(pattern) + "\n")
  628. create_new_pattern(pattern) # create new pattern
  629. print("-"*15 + "\n")
  630. print ("[LIBRE-AI] [INFO] A new [BRAIN] has been created !!! ... -> [ADVANCING!]\n")
  631. f=open(brain_path, 'r')
  632. memory = f.read().replace('\n',' ')
  633. f.close()
  634. return memory
  635. def print_banner():
  636. print("\n"+"="*50)
  637. print(" ____ _ _ _ _ ")
  638. print("| _ \(_) __ _| \ | | / \ ")
  639. print("| | | | |/ _` | \| | / _ \ ")
  640. print("| |_| | | (_| | |\ |/ ___ \ ")
  641. print("|____/|_|\__,_|_| \_/_/ \_\ by psy")
  642. print('\n"Search and Recognize patterns in DNA sequences"')
  643. print("\n"+"="*50)
  644. print("+ GENOMES DETECTED:", str(num_files))
  645. print("="*50)
  646. print("\n"+"-"*15+"\n")
  647. print(" * VERSION: ")
  648. print(" + "+VERSION+" - (rev:"+RELEASE+")")
  649. print("\n * SOURCES:")
  650. print(" + "+SOURCE1)
  651. print(" + "+SOURCE2)
  652. print("\n * CONTACT: ")
  653. print(" + "+CONTACT+"\n")
  654. print("-"*15+"\n")
  655. print("="*50)
  656. # sub_init #
  657. num_files=0
  658. for file in glob.iglob(genomes_path + '**/*', recursive=True):
  659. if(file.endswith(".genome")):
  660. num_files = num_files + 1
  661. f=open(file, 'r')
  662. genome = f.read().replace('\n',' ')
  663. genomes[file.replace("datasets/","")] = genome.upper() # add genome to main dict
  664. f.close()
  665. print_banner() # show banner
  666. option = input("\n+ CHOOSE: (S)earch, (L)ist, (T)rain or (R)eport: ").upper()
  667. print("")
  668. print("="*50+"\n")
  669. if option == "S": # search pattern
  670. search_pattern_with_human()
  671. elif option == "L": # list genomes
  672. list_genomes_on_database()
  673. elif option == "T": # teach AI
  674. teach_ai()
  675. else: # libre AI
  676. libre_ai()
  677. print ("="*50+"\n")