Wikipedia article on PageRank

Original PageRank paper: Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry (1999) The PageRank Citation Ranking: Bringing Order to the Web. Technical Report. Stanford InfoLab.

Loading the data

In [2]:
from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")
In [3]:
counts
Out[3]:
[[0, 1, 0, 0, 0],
 [0, 0, 2, 2, 1],
 [0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 1, 0, 0]]
In [4]:
out_degree
Out[4]:
[1, 5, 1, 1, 2]

The pseudocode for the rest of the code is:

transition_matrix = <compute transition matrix>
times_visited = <list of size N>
for i in range(num_steps):
    page = <take a step with the random surfer>
    times_visited[page] += 1

for c in times_visited:
    print c / num_steps

Computing the transition matrix

In [5]:
def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = 0.0 # Instead of 0.0 we need to compute a probability

    return transition_matrix
In [6]:
compute_transition(counts, out_degree)
Out[6]:
[[0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0]]
In [7]:
n = 5
i = 1
j = 2

# 90% of the times, the user goes to page "j" with a probability
# proportional to the number of links from "i" to "j".
# 40% of the outbound links in page 1 go to page 2
p_followlink = 0.9 * counts[i][j]/out_degree[i]   # i.e., 0.9 * 2/5 = 0.36

# 10% of the times, we just randomly pick any page
p_any = 0.1 * (1/n)

# The total probability is the sum of the two
p = p_followlink + p_any
In [8]:
p
Out[8]:
0.38
In [9]:
def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = (0.9 * counts[i][j]/out_degree[i]) + (0.1 * 1/n)

    return transition_matrix
In [10]:
compute_transition(counts, out_degree)
Out[10]:
[[0.02, 0.92, 0.02, 0.02, 0.02],
 [0.02, 0.02, 0.38, 0.38, 0.19999999999999998],
 [0.02, 0.02, 0.02, 0.92, 0.02],
 [0.92, 0.02, 0.02, 0.02, 0.02],
 [0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]]

Simulating the random surfer

In [11]:
num_steps = 1000

transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = 0 # Instead of 0, we need to figure out the result of a single step by the random surfer
    times_visited[page] += 1
In [12]:
print(times_visited)
[1000, 0, 0, 0, 0]
In [13]:
transition_matrix[4]
Out[13]:
[0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]
In [14]:
probabilities = transition_matrix[4]

# Let's hardcode 0.7. This should choose page 2
r = 0.7
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break
2
In [15]:
import random

probabilities = transition_matrix[4]

r = random.uniform(0.0, 1.0)
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break
2
In [16]:
def make_one_move(transition_matrix, page):
    n = len(transition_matrix)
    r = random.uniform(0.0, 1.0)
    psum = 0.0

    for j in range(n):
        psum += transition_matrix[page][j]
        if psum >= r:
            return j
In [17]:
probs = []
for i in range(1000):
    probs.append(make_one_move(transition_matrix, 4))
In [18]:
for i in range(n):
    print(i, probs.count(i))
0 481
1 19
2 465
3 17
4 18
In [21]:
num_steps = 1000

transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = make_one_move(transition_matrix, page)
    times_visited[page] += 1
In [22]:
for i, count in enumerate(times_visited):
    print(i, count)
0 272
1 271
2 143
3 245
4 69
In [23]:
def random_surfer(transition_matrix, num_steps):
    n = len(transition_matrix)
    page = 0
    times_visited = [0]*n

    for t in range(num_steps):
        page = make_one_move(transition_matrix, page)
        times_visited[page] += 1

    for i, count in enumerate(times_visited):
        v = count / num_steps
        print(i, v)
In [29]:
random_surfer(transition_matrix, 1000)
0 0.277
1 0.261
2 0.139
3 0.25
4 0.073

All together

In [30]:
from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.267
1 0.258
2 0.147
3 0.245
4 0.083
In [34]:
counts, out_degree = read_graph("medium.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.004
1 0.015
2 0.006
3 0.004
4 0.002
5 0.017
6 0.058
7 0.033
8 0.016
9 0.042
10 0.002
11 0.028
12 0.017
13 0.036
14 0.028
15 0.017
16 0.017
17 0.0
18 0.028
19 0.02
20 0.001
21 0.035
22 0.052
23 0.037
24 0.008
25 0.004
26 0.016
27 0.009
28 0.043
29 0.003
30 0.035
31 0.025
32 0.021
33 0.012
34 0.02
35 0.02
36 0.005
37 0.027
38 0.009
39 0.027
40 0.007
41 0.037
42 0.024
43 0.014
44 0.036
45 0.004
46 0.018
47 0.018
48 0.032
49 0.011