Original PageRank paper: Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry (1999) The PageRank Citation Ranking: Bringing Order to the Web. Technical Report. Stanford InfoLab.

Loading the data¶

from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")

counts

[[0, 1, 0, 0, 0],
 [0, 0, 2, 2, 1],
 [0, 0, 0, 1, 0],
 [1, 0, 0, 0, 0],
 [1, 0, 1, 0, 0]]

out_degree

[1, 5, 1, 1, 2]

The pseudocode for the rest of the code is:

transition_matrix = <compute transition matrix>
times_visited = <list of size N>
for i in range(num_steps):
    page = <take a step with the random surfer>
    times_visited[page] += 1

for c in times_visited:
    print c / num_steps

Computing the transition matrix¶

def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = 0.0 # Instead of 0.0 we need to compute a probability

    return transition_matrix

compute_transition(counts, out_degree)

[[0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0],
 [0.0, 0.0, 0.0, 0.0, 0.0]]

n = 5
i = 1
j = 2

# 90% of the times, the user goes to page "j" with a probability
# proportional to the number of links from "i" to "j".
# 40% of the outbound links in page 1 go to page 2
p_followlink = 0.9 * counts[i][j]/out_degree[i]   # i.e., 0.9 * 2/5 = 0.36

# 10% of the times, we just randomly pick any page
p_any = 0.1 * (1/n)

# The total probability is the sum of the two
p = p_followlink + p_any

p

0.38

def compute_transition(counts, out_degree):
    n = len(out_degree)

    # Create an empty n x n "matrix" (using lists of lists)
    transition_matrix = []
    for i in range(n):
        transition_matrix.append([0]*n)   

    for i in range(n):
        for j in range(n):
            transition_matrix[i][j] = (0.9 * counts[i][j]/out_degree[i]) + (0.1 * 1/n)

    return transition_matrix

compute_transition(counts, out_degree)

[[0.02, 0.92, 0.02, 0.02, 0.02],
 [0.02, 0.02, 0.38, 0.38, 0.19999999999999998],
 [0.02, 0.02, 0.02, 0.92, 0.02],
 [0.92, 0.02, 0.02, 0.02, 0.02],
 [0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]]

Simulating the random surfer¶

num_steps = 1000

transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = 0 # Instead of 0, we need to figure out the result of a single step by the random surfer
    times_visited[page] += 1

print(times_visited)

[1000, 0, 0, 0, 0]

transition_matrix[4]

[0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]

probabilities = transition_matrix[4]

# Let's hardcode 0.7. This should choose page 2
r = 0.7
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break

2

import random

probabilities = transition_matrix[4]

r = random.uniform(0.0, 1.0)
psum = 0.0

for j in range(n):
    psum += probabilities[j]
    if psum >= r:
        print(j)
        break

0

def make_one_move(transition_matrix, page):
    n = len(transition_matrix)
    r = random.uniform(0.0, 1.0)
    psum = 0.0

    for j in range(n):
        psum += transition_matrix[page][j]
        if psum >= r:
            return j

    # If we get to the end of the loop without returning, we return
    # the last value.
    return n

probs = []
for i in range(1000):
    probs.append(make_one_move(transition_matrix, 4))

for i in range(n):
    print(i, probs.count(i))

0 473
1 16
2 471
3 18
4 22

num_steps = 1000

transition_matrix = compute_transition(times_visited, out_degree)
n = len(transition_matrix)
times_visited = [0]*n

page = 0
for t in range(num_steps):
    page = make_one_move(transition_matrix, page)
    times_visited[page] += 1

for i, count in enumerate(times_visited):
    print(i, count)

0 274
1 266
2 162
3 238
4 60

def random_surfer(transition_matrix, num_steps):
    n = len(transition_matrix)
    page = 0
    times_visited = [0]*n

    for t in range(num_steps):
        page = make_one_move(transition_matrix, page)
        times_visited[page] += 1

    for i, count in enumerate(times_visited):
        v = count / num_steps
        print(i, v)

random_surfer(transition_matrix, 1000)

0 0.268
1 0.264
2 0.161
3 0.236
4 0.071

All together¶

from pagerank import read_graph

counts, out_degree = read_graph("tiny.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)

0 0.279
1 0.263
2 0.144
3 0.257
4 0.057

counts, out_degree = read_graph("medium.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)

0 0.001
1 0.017
2 0.008
3 0.001
4 0.005
5 0.021
6 0.065
7 0.014
8 0.01
9 0.052
10 0.004
11 0.037
12 0.003
13 0.051
14 0.011
15 0.024
16 0.03
17 0.004
18 0.016
19 0.021
20 0.004
21 0.027
22 0.064
23 0.046
24 0.011
25 0.005
26 0.005
27 0.007
28 0.03
29 0.0
30 0.051
31 0.014
32 0.018
33 0.014
34 0.019
35 0.03
36 0.003
37 0.047
38 0.018
39 0.021
40 0.013
41 0.014
42 0.015
43 0.022
44 0.015
45 0.007
46 0.029
47 0.03
48 0.01
49 0.016