Counting k-mers

In [9]:
def generate_dna_kmers(k):
  '''
  Return a list of all possible substrings of
  length k using only characters A, C, T, and G
  '''
  bases = ["A", "C", "T", "G"]

  last = bases
  current = []
  for i in range(k-1):
    for b in bases:
      for l in last:
        current.append(l+b)
    last = current
    current= []
  return last
In [10]:
generate_dna_kmers(1)
Out[10]:
['A', 'C', 'T', 'G']
In [11]:
generate_dna_kmers(2)
Out[11]:
['AA',
 'CA',
 'TA',
 'GA',
 'AC',
 'CC',
 'TC',
 'GC',
 'AT',
 'CT',
 'TT',
 'GT',
 'AG',
 'CG',
 'TG',
 'GG']
In [12]:
generate_dna_kmers(3)
Out[12]:
['AAA',
 'CAA',
 'TAA',
 'GAA',
 'ACA',
 'CCA',
 'TCA',
 'GCA',
 'ATA',
 'CTA',
 'TTA',
 'GTA',
 'AGA',
 'CGA',
 'TGA',
 'GGA',
 'AAC',
 'CAC',
 'TAC',
 'GAC',
 'ACC',
 'CCC',
 'TCC',
 'GCC',
 'ATC',
 'CTC',
 'TTC',
 'GTC',
 'AGC',
 'CGC',
 'TGC',
 'GGC',
 'AAT',
 'CAT',
 'TAT',
 'GAT',
 'ACT',
 'CCT',
 'TCT',
 'GCT',
 'ATT',
 'CTT',
 'TTT',
 'GTT',
 'AGT',
 'CGT',
 'TGT',
 'GGT',
 'AAG',
 'CAG',
 'TAG',
 'GAG',
 'ACG',
 'CCG',
 'TCG',
 'GCG',
 'ATG',
 'CTG',
 'TTG',
 'GTG',
 'AGG',
 'CGG',
 'TGG',
 'GGG']
In [13]:
def count_mer(mer, seq):
  '''
  Counts the number of times a substring mer
  ocurrs in the sequence seq (including overlapping
  occurrences)

  sample use: count_mer("GGG", "AGGGCGGG") => 2
  '''

  k = len(mer)
  count = 0
  for i in range(0, len(seq)-k+1):
    if mer == seq[i:i+k]:
       count = count + 1
  return count
In [14]:
count_mer("GGG", "AGGGCGGG")
Out[14]:
2
In [15]:
def kmer_count(k, seq):
  '''
  Return a list of the number of times each possible k-mer appears
  in seq, including overlapping occurrences.
  '''

  mers = generate_dna_kmers(k)
  rv = []
  for m in mers:
    cnt = count_mer(m, seq)
    if cnt != 0:
      rv.append((m, cnt))
  return rv
In [17]:
kmer_count(2, "AAGAGT")
Out[17]:
[('AA', 1), ('GA', 1), ('GT', 1), ('AG', 2)]
In [18]:
kmer_count(3, "AAGAGT")
Out[18]:
[('AGA', 1), ('AGT', 1), ('AAG', 1), ('GAG', 1)]
In [20]:
seq1 = "AAGGCCTT"
seq2 = "AAAGGGCCCTTT"
seq3 = "ACTGTGCAAACCTTGGGGTTCCAA"
seq4 = "AAGGAAGGCGCGCCGGAAAGAG"
seq5 = "GGGGAGGG"
In [42]:
kmer_count(1, seq4)
Out[42]:
[('A', 8), ('C', 4), ('G', 10)]
In [43]:
kmer_count(2, seq4)
Out[43]:
[('AA', 4), ('GA', 3), ('CC', 1), ('GC', 3), ('AG', 4), ('CG', 3), ('GG', 3)]
In [44]:
kmer_count(3, seq4)
Out[44]:
[('AAA', 1),
 ('GAA', 2),
 ('AGA', 1),
 ('GGA', 2),
 ('GCC', 1),
 ('CGC', 2),
 ('GGC', 1),
 ('AAG', 3),
 ('GAG', 1),
 ('CCG', 1),
 ('GCG', 2),
 ('AGG', 2),
 ('CGG', 1)]
In [25]:
from dnafile import read

seq_drosofila = read("drosofila-small.txt")
In [27]:
len(seq_drosofila)
Out[27]:
951900
In [29]:
kmer_count(1, seq_drosofila)
Out[29]:
[('A', 278920), ('C', 196552), ('T', 277040), ('G', 199388)]
In [31]:
kmer_count(2, seq_drosofila)
Out[31]:
[('AA', 96382),
 ('CA', 61129),
 ('TA', 66451),
 ('GA', 54957),
 ('AC', 52022),
 ('CC', 41883),
 ('TC', 54715),
 ('GC', 47932),
 ('AT', 75767),
 ('CT', 54085),
 ('TT', 94270),
 ('GT', 52918),
 ('AG', 54749),
 ('CG', 39455),
 ('TG', 61603),
 ('GG', 43581)]
In [33]:
%time kmers = kmer_count(1, seq_drosofila)
CPU times: user 304 ms, sys: 4.01 ms, total: 308 ms
Wall time: 308 ms
In [35]:
%time kmers = kmer_count(2, seq_drosofila)
CPU times: user 1.4 s, sys: 0 ns, total: 1.4 s
Wall time: 1.39 s
In [36]:
%time kmers = kmer_count(3, seq_drosofila)
CPU times: user 5.51 s, sys: 0 ns, total: 5.51 s
Wall time: 5.5 s
In [37]:
%time kmers = kmer_count(4, seq_drosofila)
CPU times: user 22 s, sys: 0 ns, total: 22 s
Wall time: 21.9 s

Counting k-mers revisited

In [53]:
def kmer_count_better(k, seq):
  '''
  Return a list of the number of times each possible k-mer appears
  in seq, including overlapping occurrences.
  '''

  rv = {}
  for i in range(0, len(seq)-k+1):
    subseq = seq[i:i+k]
    v = rv.get(subseq, 0)
    rv[subseq] = v + 1
  return list(rv.items())
In [54]:
seq4
Out[54]:
'AAGGAAGGCGCGCCGGAAAGAG'
In [55]:
seq4[3:3+4]
Out[55]:
'GAAG'
In [56]:
kmer_count_better(1, seq4)
Out[56]:
[('C', 4), ('A', 8), ('G', 10)]
In [57]:
kmer_count_better(2, seq4)
Out[57]:
[('GC', 3), ('AA', 4), ('GA', 3), ('CC', 1), ('AG', 4), ('GG', 3), ('CG', 3)]
In [58]:
kmer_count_better(3, seq4)
Out[58]:
[('GGA', 2),
 ('AGA', 1),
 ('CGC', 2),
 ('GCG', 2),
 ('GAG', 1),
 ('GGC', 1),
 ('GCC', 1),
 ('AGG', 2),
 ('GAA', 2),
 ('AAG', 3),
 ('CCG', 1),
 ('AAA', 1),
 ('CGG', 1)]
In [70]:
%timeit kmers = kmer_count_better(1, seq_drosofila)
10 loops, best of 3: 141 ms per loop
In [73]:
%timeit kmers = kmer_count_better(2, seq_drosofila)
10 loops, best of 3: 182 ms per loop
In [72]:
%timeit kmers = kmer_count_better(3, seq_drosofila)
10 loops, best of 3: 187 ms per loop
In [77]:
%timeit kmers = kmer_count_better(4, seq_drosofila)
1 loops, best of 3: 188 ms per loop
In [106]:
%timeit kmers = kmer_count_better(100, seq_drosofila)
1 loops, best of 3: 512 ms per loop
In [79]:
%timeit kmers = kmer_count_better(500, seq_drosofila)
1 loops, best of 3: 661 ms per loop
In [80]:
%timeit kmers = kmer_count_better(1000, seq_drosofila)
1 loops, best of 3: 955 ms per loop