Counting k-mers

In [1]:
def generate_dna_kmers(k):
    '''
    Return a list of all possible substrings of
    length k using only characters A, C, T, and G
    '''
    bases = ["A", "C", "T", "G"]

    last = bases
    current = []
    for i in range(k-1):
        for b in bases:
            for l in last:
                current.append(l+b)
        last = current
        current= []
    return last
In [2]:
generate_dna_kmers(1)
Out[2]:
['A', 'C', 'T', 'G']
In [3]:
generate_dna_kmers(2)
Out[3]:
['AA',
 'CA',
 'TA',
 'GA',
 'AC',
 'CC',
 'TC',
 'GC',
 'AT',
 'CT',
 'TT',
 'GT',
 'AG',
 'CG',
 'TG',
 'GG']
In [4]:
generate_dna_kmers(3)
Out[4]:
['AAA',
 'CAA',
 'TAA',
 'GAA',
 'ACA',
 'CCA',
 'TCA',
 'GCA',
 'ATA',
 'CTA',
 'TTA',
 'GTA',
 'AGA',
 'CGA',
 'TGA',
 'GGA',
 'AAC',
 'CAC',
 'TAC',
 'GAC',
 'ACC',
 'CCC',
 'TCC',
 'GCC',
 'ATC',
 'CTC',
 'TTC',
 'GTC',
 'AGC',
 'CGC',
 'TGC',
 'GGC',
 'AAT',
 'CAT',
 'TAT',
 'GAT',
 'ACT',
 'CCT',
 'TCT',
 'GCT',
 'ATT',
 'CTT',
 'TTT',
 'GTT',
 'AGT',
 'CGT',
 'TGT',
 'GGT',
 'AAG',
 'CAG',
 'TAG',
 'GAG',
 'ACG',
 'CCG',
 'TCG',
 'GCG',
 'ATG',
 'CTG',
 'TTG',
 'GTG',
 'AGG',
 'CGG',
 'TGG',
 'GGG']
In [5]:
def count_mer(mer, seq):
    '''
    Counts the number of times a substring mer
    ocurrs in the sequence seq (including overlapping
    occurrences)

    sample use: count_mer("GGG", "AGGGCGGG") => 2
    '''

    k = len(mer)
    count = 0
    for i in range(0, len(seq)-k+1):
        if mer == seq[i:i+k]:
            count = count + 1
    return count
In [6]:
count_mer("GGG", "AGGGCGGG")
Out[6]:
2
In [7]:
def kmer_count(k, seq):
    '''
    Return a list of the number of times each possible k-mer appears
    in seq, including overlapping occurrences.
    '''

    mers = generate_dna_kmers(k)
    rv = []
    for m in mers:
        cnt = count_mer(m, seq)
        if cnt != 0:
            rv.append((m, cnt))
    return rv
In [8]:
kmer_count(2, "AAGAGT")
Out[8]:
[('AA', 1), ('GA', 1), ('GT', 1), ('AG', 2)]
In [9]:
kmer_count(3, "AAGAGT")
Out[9]:
[('AGA', 1), ('AGT', 1), ('AAG', 1), ('GAG', 1)]
In [10]:
seq1 = "AAGGCCTT"
seq2 = "AAAGGGCCCTTT"
seq3 = "ACTGTGCAAACCTTGGGGTTCCAA"
seq4 = "AAGGAAGGCGCGCCGGAAAGAG"
seq5 = "GGGGAGGG"
In [11]:
kmer_count(1, seq4)
Out[11]:
[('A', 8), ('C', 4), ('G', 10)]
In [12]:
kmer_count(2, seq4)
Out[12]:
[('AA', 4), ('GA', 3), ('CC', 1), ('GC', 3), ('AG', 4), ('CG', 3), ('GG', 3)]
In [13]:
kmer_count(3, seq4)
Out[13]:
[('AAA', 1),
 ('GAA', 2),
 ('AGA', 1),
 ('GGA', 2),
 ('GCC', 1),
 ('CGC', 2),
 ('GGC', 1),
 ('AAG', 3),
 ('GAG', 1),
 ('CCG', 1),
 ('GCG', 2),
 ('AGG', 2),
 ('CGG', 1)]
In [14]:
from dnafile import read

seq_drosofila = read("drosofila-small.txt")
In [15]:
len(seq_drosofila)
Out[15]:
951900
In [16]:
kmer_count(1, seq_drosofila)
Out[16]:
[('A', 278920), ('C', 196552), ('T', 277040), ('G', 199388)]
In [17]:
kmer_count(2, seq_drosofila)
Out[17]:
[('AA', 96382),
 ('CA', 61129),
 ('TA', 66451),
 ('GA', 54957),
 ('AC', 52022),
 ('CC', 41883),
 ('TC', 54715),
 ('GC', 47932),
 ('AT', 75767),
 ('CT', 54085),
 ('TT', 94270),
 ('GT', 52918),
 ('AG', 54749),
 ('CG', 39455),
 ('TG', 61603),
 ('GG', 43581)]
In [18]:
%time kmers = kmer_count(1, seq_drosofila)
CPU times: user 404 ms, sys: 0 ns, total: 404 ms
Wall time: 404 ms
In [19]:
%time kmers = kmer_count(2, seq_drosofila)
CPU times: user 1.8 s, sys: 0 ns, total: 1.8 s
Wall time: 1.8 s
In [20]:
%time kmers = kmer_count(3, seq_drosofila)
CPU times: user 7 s, sys: 0 ns, total: 7 s
Wall time: 7 s
In [21]:
%time kmers = kmer_count(4, seq_drosofila)
CPU times: user 29.3 s, sys: 4 ms, total: 29.4 s
Wall time: 29.4 s

Counting k-mers revisited

In [22]:
def kmer_count_better(k, seq):
    '''
    Return a list of the number of times each possible k-mer appears
    in seq, including overlapping occurrences.
    '''

    rv = {}
    for i in range(0, len(seq)-k+1):
        subseq = seq[i:i+k]
        v = rv.get(subseq, 0)
        rv[subseq] = v + 1
    return list(rv.items())
In [23]:
seq4
Out[23]:
'AAGGAAGGCGCGCCGGAAAGAG'
In [24]:
seq4[3:3+4]
Out[24]:
'GAAG'
In [25]:
kmer_count_better(1, seq4)
Out[25]:
[('A', 8), ('C', 4), ('G', 10)]
In [26]:
kmer_count_better(2, seq4)
Out[26]:
[('GC', 3), ('AA', 4), ('AG', 4), ('CC', 1), ('CG', 3), ('GG', 3), ('GA', 3)]
In [27]:
kmer_count_better(3, seq4)
Out[27]:
[('AAG', 3),
 ('GAA', 2),
 ('CCG', 1),
 ('GGA', 2),
 ('AGA', 1),
 ('GGC', 1),
 ('GCG', 2),
 ('GCC', 1),
 ('GAG', 1),
 ('AAA', 1),
 ('AGG', 2),
 ('CGC', 2),
 ('CGG', 1)]
In [28]:
%timeit kmers = kmer_count_better(1, seq_drosofila)
183 ms ± 439 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [29]:
%timeit kmers = kmer_count_better(2, seq_drosofila)
228 ms ± 2.31 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [30]:
%timeit kmers = kmer_count_better(3, seq_drosofila)
235 ms ± 1.98 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [31]:
%timeit kmers = kmer_count_better(4, seq_drosofila)
242 ms ± 4.94 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [32]:
%timeit kmers = kmer_count_better(100, seq_drosofila)
615 ms ± 13.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [33]:
%timeit kmers = kmer_count_better(500, seq_drosofila)
875 ms ± 4.26 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [34]:
%timeit kmers = kmer_count_better(1000, seq_drosofila)
1.26 s ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)