Original PageRank paper: Page, Lawrence and Brin, Sergey and Motwani, Rajeev and Winograd, Terry (1999) The PageRank Citation Ranking: Bringing Order to the Web. Technical Report. Stanford InfoLab.
from pagerank import read_graph
counts, out_degree = read_graph("tiny.txt")
counts
[[0, 1, 0, 0, 0], [0, 0, 2, 2, 1], [0, 0, 0, 1, 0], [1, 0, 0, 0, 0], [1, 0, 1, 0, 0]]
out_degree
[1, 5, 1, 1, 2]
The pseudocode for the rest of the code is:
transition_matrix = <compute transition matrix>
times_visited = <list of size N>
for i in range(num_steps):
page = <take a step with the random surfer>
times_visited[page] += 1
for c in times_visited:
print c / num_steps
def compute_transition(counts, out_degree):
n = len(out_degree)
# Create an empty n x n "matrix" (using lists of lists)
transition_matrix = []
for i in range(n):
transition_matrix.append([0]*n)
for i in range(n):
for j in range(n):
transition_matrix[i][j] = 0.0 # Instead of 0.0 we need to compute a probability
return transition_matrix
compute_transition(counts, out_degree)
[[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]]
n = 5
i = 1
j = 2
# 90% of the times, the user goes to page "j" with a probability
# proportional to the number of links from "i" to "j".
# 40% of the outbound links in page 1 go to page 2
p_followlink = 0.9 * counts[i][j]/out_degree[i] # i.e., 0.9 * 2/5 = 0.36
# 10% of the times, we just randomly pick any page
p_any = 0.1 * (1/n)
# The total probability is the sum of the two
p = p_followlink + p_any
p
0.38
def compute_transition(counts, out_degree):
n = len(out_degree)
# Create an empty n x n "matrix" (using lists of lists)
transition_matrix = []
for i in range(n):
transition_matrix.append([0]*n)
for i in range(n):
for j in range(n):
transition_matrix[i][j] = (0.9 * counts[i][j]/out_degree[i]) + (0.1 * 1/n)
return transition_matrix
compute_transition(counts, out_degree)
[[0.02, 0.92, 0.02, 0.02, 0.02], [0.02, 0.02, 0.38, 0.38, 0.19999999999999998], [0.02, 0.02, 0.02, 0.92, 0.02], [0.92, 0.02, 0.02, 0.02, 0.02], [0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]]
num_steps = 1000
transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n
page = 0
for t in range(num_steps):
page = 0 # Instead of 0, we need to figure out the result of a single step by the random surfer
times_visited[page] += 1
print(times_visited)
[1000, 0, 0, 0, 0]
transition_matrix[4]
[0.47000000000000003, 0.02, 0.47000000000000003, 0.02, 0.02]
probabilities = transition_matrix[4]
# Let's hardcode 0.7. This should choose page 2
r = 0.7
psum = 0.0
for j in range(n):
psum += probabilities[j]
if psum >= r:
print(j)
break
2
import random
probabilities = transition_matrix[4]
r = random.uniform(0.0, 1.0)
psum = 0.0
for j in range(n):
psum += probabilities[j]
if psum >= r:
print(j)
break
2
def make_one_move(transition_matrix, page):
n = len(transition_matrix)
r = random.uniform(0.0, 1.0)
psum = 0.0
for j in range(n):
psum += transition_matrix[page][j]
if psum >= r:
return j
probs = []
for i in range(1000):
probs.append(make_one_move(transition_matrix, 4))
for i in range(n):
print(i, probs.count(i))
0 481 1 19 2 465 3 17 4 18
num_steps = 1000
transition_matrix = compute_transition(counts, out_degree)
n = len(transition_matrix)
times_visited = [0]*n
page = 0
for t in range(num_steps):
page = make_one_move(transition_matrix, page)
times_visited[page] += 1
for i, count in enumerate(times_visited):
print(i, count)
0 272 1 271 2 143 3 245 4 69
def random_surfer(transition_matrix, num_steps):
n = len(transition_matrix)
page = 0
times_visited = [0]*n
for t in range(num_steps):
page = make_one_move(transition_matrix, page)
times_visited[page] += 1
for i, count in enumerate(times_visited):
v = count / num_steps
print(i, v)
random_surfer(transition_matrix, 1000)
0 0.277 1 0.261 2 0.139 3 0.25 4 0.073
from pagerank import read_graph
counts, out_degree = read_graph("tiny.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.267 1 0.258 2 0.147 3 0.245 4 0.083
counts, out_degree = read_graph("medium.txt")
transition_matrix = compute_transition(counts, out_degree)
random_surfer(transition_matrix, 1000)
0 0.004 1 0.015 2 0.006 3 0.004 4 0.002 5 0.017 6 0.058 7 0.033 8 0.016 9 0.042 10 0.002 11 0.028 12 0.017 13 0.036 14 0.028 15 0.017 16 0.017 17 0.0 18 0.028 19 0.02 20 0.001 21 0.035 22 0.052 23 0.037 24 0.008 25 0.004 26 0.016 27 0.009 28 0.043 29 0.003 30 0.035 31 0.025 32 0.021 33 0.012 34 0.02 35 0.02 36 0.005 37 0.027 38 0.009 39 0.027 40 0.007 41 0.037 42 0.024 43 0.014 44 0.036 45 0.004 46 0.018 47 0.018 48 0.032 49 0.011