Skip to content

Latest commit

 

History

History
87 lines (63 loc) · 2.47 KB

python_regexp_solutions.md

File metadata and controls

87 lines (63 loc) · 2.47 KB

##Accessions

  import re
  
  accs = "xkn59438", "yhdck2", "eihd39d9", "chdsye847", "hedle3455", "xjhd53e", "45da", "de37dp"
  print("containing the number 5")
  for acc in accs:
      if re.search(r"5", acc):
          print("\t" + acc)
  
  print("containing d or e ")
  for acc in accs:
      if re.search(r"(d|e)", acc):
          print("\t" + acc)
  
  print("containing d followed by e ")
  for acc in accs:
      if re.search(r"d.*e", acc):
          print("\t" + acc)
  
  print("containing d followed by e with a single letter between them ")
  for acc in accs:
      if re.search(r"d.e", acc):
          print("\t" + acc)
  
  print("containing d and e in any order")
  for acc in accs:
      if re.search(r"d.*e", acc) or re.search(r"e.*d", acc):
          print("\t" + acc)
  
  
  print("starting with x or y")
  for acc in accs:
      if re.search(r"^(x|y)", acc):
          print("\t" + acc)
  
  
  print("starting with x or y and ending with e")
  for acc in accs:
      if re.search(r"^(x|y).*e$", acc):
          print("\t" + acc)
  
  
  print("three or more numbers in a row")
  for acc in accs:
      if re.search(r"[0123456789]{3,100}", acc):
          print("\t" + acc)
  
  
  print("three or more numbers in a row, more concisely")
  for acc in accs:
      if re.search(r"\d{3,}", acc):
          print("\t" + acc)
  
  
  print("ends with d and either a, r or p")
  for acc in accs:
      if re.search(r"d[arp]$", acc):
          print("\t" + acc)

##Double digest

  import re
  dna = open("dna.txt").read().rstrip("\n")
  print(str(len(dna)))
  all_cuts = [0]
  
  # add cut positions for AbcI
  for match in re.finditer(r"A[ATGC]TAAT", dna):
      all_cuts.append(match.start() + 3)
  
  # add cut positions for AbcII
  for match in re.finditer(r"GC[AG][AT]TG", dna):
      all_cuts.append(match.start() + 4)
  
  # add the final position
  all_cuts.append(len(dna))
  sorted_cuts = sorted(all_cuts)
  print(sorted_cuts)
  
  
  for i in range(1,len(sorted_cuts)):
      this_cut_position = sorted_cuts[i]
      previous_cut_position = sorted_cuts[i-1]
      fragment_size = this_cut_position - previous_cut_position
      print("one fragment size is  "  + str(fragment_size))