def generate_dna_kmers(k):
'''
Return a list of all possible substrings of
length k using only characters A, C, T, and G
'''
bases = ["A", "C", "T", "G"]
last = bases
current = []
for i in range(k-1):
for b in bases:
for l in last:
current.append(l+b)
last = current
current= []
return last
generate_dna_kmers(1)
['A', 'C', 'T', 'G']
generate_dna_kmers(2)
['AA', 'CA', 'TA', 'GA', 'AC', 'CC', 'TC', 'GC', 'AT', 'CT', 'TT', 'GT', 'AG', 'CG', 'TG', 'GG']
generate_dna_kmers(3)
['AAA', 'CAA', 'TAA', 'GAA', 'ACA', 'CCA', 'TCA', 'GCA', 'ATA', 'CTA', 'TTA', 'GTA', 'AGA', 'CGA', 'TGA', 'GGA', 'AAC', 'CAC', 'TAC', 'GAC', 'ACC', 'CCC', 'TCC', 'GCC', 'ATC', 'CTC', 'TTC', 'GTC', 'AGC', 'CGC', 'TGC', 'GGC', 'AAT', 'CAT', 'TAT', 'GAT', 'ACT', 'CCT', 'TCT', 'GCT', 'ATT', 'CTT', 'TTT', 'GTT', 'AGT', 'CGT', 'TGT', 'GGT', 'AAG', 'CAG', 'TAG', 'GAG', 'ACG', 'CCG', 'TCG', 'GCG', 'ATG', 'CTG', 'TTG', 'GTG', 'AGG', 'CGG', 'TGG', 'GGG']
def count_mer(mer, seq):
'''
Counts the number of times a substring mer
ocurrs in the sequence seq (including overlapping
occurrences)
sample use: count_mer("GGG", "AGGGCGGG") => 2
'''
k = len(mer)
count = 0
for i in range(0, len(seq)-k+1):
if mer == seq[i:i+k]:
count = count + 1
return count
count_mer("GGG", "AGGGCGGG")
2
def kmer_count(k, seq):
'''
Return a list of the number of times each possible k-mer appears
in seq, including overlapping occurrences.
'''
mers = generate_dna_kmers(k)
rv = []
for m in mers:
cnt = count_mer(m, seq)
if cnt != 0:
rv.append((m, cnt))
return rv
kmer_count(2, "AAGAGT")
[('AA', 1), ('GA', 1), ('GT', 1), ('AG', 2)]
kmer_count(3, "AAGAGT")
[('AGA', 1), ('AGT', 1), ('AAG', 1), ('GAG', 1)]
seq1 = "AAGGCCTT"
seq2 = "AAAGGGCCCTTT"
seq3 = "ACTGTGCAAACCTTGGGGTTCCAA"
seq4 = "AAGGAAGGCGCGCCGGAAAGAG"
seq5 = "GGGGAGGG"
kmer_count(1, seq4)
[('A', 8), ('C', 4), ('G', 10)]
kmer_count(2, seq4)
[('AA', 4), ('GA', 3), ('CC', 1), ('GC', 3), ('AG', 4), ('CG', 3), ('GG', 3)]
kmer_count(3, seq4)
[('AAA', 1), ('GAA', 2), ('AGA', 1), ('GGA', 2), ('GCC', 1), ('CGC', 2), ('GGC', 1), ('AAG', 3), ('GAG', 1), ('CCG', 1), ('GCG', 2), ('AGG', 2), ('CGG', 1)]
from dnafile import read
seq_drosofila = read("drosofila-small.txt")
len(seq_drosofila)
951900
kmer_count(1, seq_drosofila)
[('A', 278920), ('C', 196552), ('T', 277040), ('G', 199388)]
kmer_count(2, seq_drosofila)
[('AA', 96382), ('CA', 61129), ('TA', 66451), ('GA', 54957), ('AC', 52022), ('CC', 41883), ('TC', 54715), ('GC', 47932), ('AT', 75767), ('CT', 54085), ('TT', 94270), ('GT', 52918), ('AG', 54749), ('CG', 39455), ('TG', 61603), ('GG', 43581)]
%time kmers = kmer_count(1, seq_drosofila)
CPU times: user 404 ms, sys: 0 ns, total: 404 ms Wall time: 404 ms
%time kmers = kmer_count(2, seq_drosofila)
CPU times: user 1.8 s, sys: 0 ns, total: 1.8 s Wall time: 1.8 s
%time kmers = kmer_count(3, seq_drosofila)
CPU times: user 7 s, sys: 0 ns, total: 7 s Wall time: 7 s
%time kmers = kmer_count(4, seq_drosofila)
CPU times: user 29.3 s, sys: 4 ms, total: 29.4 s Wall time: 29.4 s
def kmer_count_better(k, seq):
'''
Return a list of the number of times each possible k-mer appears
in seq, including overlapping occurrences.
'''
rv = {}
for i in range(0, len(seq)-k+1):
subseq = seq[i:i+k]
v = rv.get(subseq, 0)
rv[subseq] = v + 1
return list(rv.items())
seq4
'AAGGAAGGCGCGCCGGAAAGAG'
seq4[3:3+4]
'GAAG'
kmer_count_better(1, seq4)
[('A', 8), ('C', 4), ('G', 10)]
kmer_count_better(2, seq4)
[('GC', 3), ('AA', 4), ('AG', 4), ('CC', 1), ('CG', 3), ('GG', 3), ('GA', 3)]
kmer_count_better(3, seq4)
[('AAG', 3), ('GAA', 2), ('CCG', 1), ('GGA', 2), ('AGA', 1), ('GGC', 1), ('GCG', 2), ('GCC', 1), ('GAG', 1), ('AAA', 1), ('AGG', 2), ('CGC', 2), ('CGG', 1)]
%timeit kmers = kmer_count_better(1, seq_drosofila)
183 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit kmers = kmer_count_better(2, seq_drosofila)
228 ms ± 2.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit kmers = kmer_count_better(3, seq_drosofila)
235 ms ± 1.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit kmers = kmer_count_better(4, seq_drosofila)
242 ms ± 4.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit kmers = kmer_count_better(100, seq_drosofila)
615 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit kmers = kmer_count_better(500, seq_drosofila)
875 ms ± 4.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit kmers = kmer_count_better(1000, seq_drosofila)
1.26 s ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)