def generate_dna_kmers(k):
'''
Return a list of all possible substrings of
length k using only characters A, C, T, and G
'''
bases = ["A", "C", "T", "G"]
last = bases
current = []
for i in range(k-1):
for b in bases:
for l in last:
current.append(l+b)
last = current
current= []
return last
generate_dna_kmers(1)
generate_dna_kmers(2)
generate_dna_kmers(3)
def count_mer(mer, seq):
'''
Counts the number of times a substring mer
ocurrs in the sequence seq (including overlapping
occurrences)
sample use: count_mer("GGG", "AGGGCGGG") => 2
'''
k = len(mer)
count = 0
for i in range(0, len(seq)-k+1):
if mer == seq[i:i+k]:
count = count + 1
return count
count_mer("GGG", "AGGGCGGG")
def kmer_count(k, seq):
'''
Return a list of the number of times each possible k-mer appears
in seq, including overlapping occurrences.
'''
mers = generate_dna_kmers(k)
rv = []
for m in mers:
cnt = count_mer(m, seq)
if cnt != 0:
rv.append((m, cnt))
return rv
kmer_count(2, "AAGAGT")
kmer_count(3, "AAGAGT")
seq1 = "AAGGCCTT"
seq2 = "AAAGGGCCCTTT"
seq3 = "ACTGTGCAAACCTTGGGGTTCCAA"
seq4 = "AAGGAAGGCGCGCCGGAAAGAG"
seq5 = "GGGGAGGG"
kmer_count(1, seq4)
kmer_count(2, seq4)
kmer_count(3, seq4)
from dnafile import read
seq_drosofila = read("drosofila-small.txt")
len(seq_drosofila)
kmer_count(1, seq_drosofila)
kmer_count(2, seq_drosofila)
%time kmers = kmer_count(1, seq_drosofila)
%time kmers = kmer_count(2, seq_drosofila)
%time kmers = kmer_count(3, seq_drosofila)
%time kmers = kmer_count(4, seq_drosofila)
def kmer_count_better(k, seq):
'''
Return a list of the number of times each possible k-mer appears
in seq, including overlapping occurrences.
'''
rv = {}
for i in range(0, len(seq)-k+1):
subseq = seq[i:i+k]
v = rv.get(subseq, 0)
rv[subseq] = v + 1
return list(rv.items())
seq4
seq4[3:3+4]
kmer_count_better(1, seq4)
kmer_count_better(2, seq4)
kmer_count_better(3, seq4)
%timeit kmers = kmer_count_better(1, seq_drosofila)
%timeit kmers = kmer_count_better(2, seq_drosofila)
%timeit kmers = kmer_count_better(3, seq_drosofila)
%timeit kmers = kmer_count_better(4, seq_drosofila)
%timeit kmers = kmer_count_better(100, seq_drosofila)
%timeit kmers = kmer_count_better(500, seq_drosofila)
%timeit kmers = kmer_count_better(1000, seq_drosofila)