Wikipedia article on PageRank

Original PageRank paper: Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry (1999) The PageRank Citation Ranking: Bringing Order to the Web. Technical Report. Stanford InfoLab.

Loading the data

In [43]:
from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")
In [44]:
counts
Out[44]:
[[0, 1, 0, 0, 0],
 [0, 0, 2, 2, 1],
 [0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 1, 0, 0]]
In [45]:
out_degree
Out[45]:
[1, 5, 1, 1, 2]

The pseudocode for the rest of the code is:

transition_matrix = <compute transition matrix>
times_visited = <list of size N>
for i in range(num_steps):
    page = <take a step with the random surfer>
    times_visited[page] += 1

for c in times_visited:
    print c / num_steps

Computing the transition matrix

In [46]:
def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = 0.0 # Instead of 0.0 we need to compute a probability

    return transition_matrix
In [47]:
compute_transition(counts, out_degree)
Out[47]:
[[0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0]]
In [48]:
n = 5
i = 1
j = 2

# 90% of the times, the user goes to page "j" with a probability
# proportional to the number of links from "i" to "j".
# 40% of the outbound links in page 1 go to page 2
p_followlink = 0.9 * counts[i][j]/out_degree[i]   # i.e., 0.9 * 2/5 = 0.36

# 10% of the times, we just randomly pick any page
p_any = 0.1 * (1/n)

# The total probability is the sum of the two
p = p_followlink + p_any
In [49]:
p
Out[49]:
0.38
In [50]:
def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = (0.9 * counts[i][j]/out_degree[i]) + (0.1 * 1/n)

    return transition_matrix
In [51]:
compute_transition(counts, out_degree)
Out[51]:
[[0.02, 0.92, 0.02, 0.02, 0.02],
 [0.02, 0.02, 0.38, 0.38, 0.19999999999999998],
 [0.02, 0.02, 0.02, 0.92, 0.02],
 [0.92, 0.02, 0.02, 0.02, 0.02],
 [0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]]

Simulating the random surfer

In [52]:
num_steps = 1000

transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = 0 # Instead of 0, we need to figure out the result of a single step by the random surfer
    times_visited[page] += 1
In [53]:
print(times_visited)
[1000, 0, 0, 0, 0]
In [54]:
transition_matrix[4]
Out[54]:
[0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]
In [55]:
probabilities = transition_matrix[4]

# Let's hardcode 0.7. This should choose page 2
r = 0.7
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break
2
In [56]:
import random

probabilities = transition_matrix[4]

r = random.uniform(0.0, 1.0)
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break
0
In [57]:
def make_one_move(transition_matrix, page):
    n = len(transition_matrix)
    r = random.uniform(0.0, 1.0)
    psum = 0.0

    for j in range(n):
        psum += transition_matrix[page][j]
        if psum >= r:
            return j

    # If we get to the end of the loop without returning, we return
    # the last value.
    return n
In [58]:
probs = []
for i in range(1000):
    probs.append(make_one_move(transition_matrix, 4))
In [59]:
for i in range(n):
    print(i, probs.count(i))
0 473
1 16
2 471
3 18
4 22
In [60]:
num_steps = 1000

transition_matrix = compute_transition(times_visited, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = make_one_move(transition_matrix, page)
    times_visited[page] += 1
In [61]:
for i, count in enumerate(times_visited):
    print(i, count)
0 274
1 266
2 162
3 238
4 60
In [73]:
def random_surfer(transition_matrix, num_steps):
    n = len(transition_matrix)
    page = 0
    times_visited = [0]*n

    for t in range(num_steps):
        page = make_one_move(transition_matrix, page)
        times_visited[page] += 1

    for i, count in enumerate(times_visited):
        v = count / num_steps
        print(i, v)
In [74]:
random_surfer(transition_matrix, 1000)
0 0.268
1 0.264
2 0.161
3 0.236
4 0.071

All together

In [76]:
from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.279
1 0.263
2 0.144
3 0.257
4 0.057

In [77]:
counts, out_degree = read_graph("medium.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.001
1 0.017
2 0.008
3 0.001
4 0.005
5 0.021
6 0.065
7 0.014
8 0.01
9 0.052
10 0.004
11 0.037
12 0.003
13 0.051
14 0.011
15 0.024
16 0.03
17 0.004
18 0.016
19 0.021
20 0.004
21 0.027
22 0.064
23 0.046
24 0.011
25 0.005
26 0.005
27 0.007
28 0.03
29 0.0
30 0.051
31 0.014
32 0.018
33 0.014
34 0.019
35 0.03
36 0.003
37 0.047
38 0.018
39 0.021
40 0.013
41 0.014
42 0.015
43 0.022
44 0.015
45 0.007
46 0.029
47 0.03
48 0.01
49 0.016