In graph theory, computing the centrality of the nodes usually amounts to finding the principal eigenvector of the adjacency matrix: each node is assigned the value of its component of that eigenvector as a centrality score. On the graph of web page hyperlinks these values are known as the PageRank scores, introduced by Google. The goal of this article is to analyze the graph of links between Wikipedia articles and rank the articles by relative importance according to this eigenvector centrality.
The traditional way to compute the principal eigenvector is the power iteration method. Here the computation is instead carried out with Martinsson's randomized SVD algorithm, as implemented in scikit-learn. The graph data is fetched from the DBpedia dumps; DBpedia is an extraction of the latent structured data embedded in Wikipedia's content.
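To illustrate the idea before touching the Wikipedia data, the sketch below runs a plain power iteration on a small, made-up adjacency matrix (the 4-node graph and the iteration count are arbitrary and not part of the example):

import numpy as np

# hypothetical 4-node directed graph; A[i, j] = 1 means node i links to node j
A = np.array([
    [0, 1, 1, 0],
    [0, 0, 1, 0],
    [1, 0, 0, 1],
    [0, 0, 1, 0],
], dtype=float)

# power iteration on A.T: a node's score is fed by the nodes linking to it
v = np.ones(4) / 4
for _ in range(50):
    v = A.T @ v
    v /= np.linalg.norm(v)
print(v)  # the largest component marks the most central node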
The steps below use Python and scikit-learn to download the data, parse the redirects, build the adjacency matrix, compute the principal singular vectors, and finally compute the centrality scores.
First, download the Wikipedia redirect and page link data. If the files are not already present on the local disk, they are downloaded from DBpedia.
import os
import urllib.request
from bz2 import BZ2File
from datetime import datetime
from pprint import pprint
from time import time

import numpy as np
from scipy import sparse
from sklearn.utils.extmath import randomized_svd

resources = [
    ("http://downloads.dbpedia.org/3.5.1/en/redirects_en.nt.bz2", "redirects_en.nt.bz2"),
    ("http://downloads.dbpedia.org/3.5.1/en/page_links_en.nt.bz2", "page_links_en.nt.bz2"),
]
redirects_filename = resources[0][1]
page_links_filename = resources[1][1]

for url, filename in resources:
    if not os.path.exists(filename):
        print("Downloading data from '%s', please wait..." % url)
        opener = urllib.request.urlopen(url)
        with open(filename, "wb") as f:
            f.write(opener.read())
        print()
Next, parse the redirect file and build a mapping that the later steps can use to resolve article names.
def index(redirects, index_map, k):
    """Find the index of an article name after resolving redirects"""
    k = redirects.get(k, k)
    return index_map.setdefault(k, len(index_map))


DBPEDIA_RESOURCE_PREFIX_LEN = len("http://dbpedia.org/resource/")
SHORTNAME_SLICE = slice(DBPEDIA_RESOURCE_PREFIX_LEN + 1, -1)


def short_name(nt_uri):
    """Remove the < and > URI markers and the common URI prefix"""
    return nt_uri[SHORTNAME_SLICE]
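As a quick illustration (the URI below is only a made-up sample of the N-Triples fields found in the dump), short_name strips the angle brackets and the common DBpedia prefix:

# hypothetical field from an .nt line; BZ2File yields bytes, so the sample is bytes too
sample = b"<http://dbpedia.org/resource/Eigenvector_centrality>"
print(short_name(sample))  # b'Eigenvector_centrality'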
def get_redirects(redirects_filename):
    """Parse the redirections and build a transitively closed map out of them"""
    redirects = {}
    print("Parsing the NT redirect file")
    for l, line in enumerate(BZ2File(redirects_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: " + str(line))
            continue
        redirects[short_name(split[0])] = short_name(split[2])
        if l % 1000000 == 0:
            print("[{}] line: {:08d}".format(datetime.now().isoformat(), l))

    # compute the transitive closure of the redirect relation
    print("Computing the transitive closure of the redirect relation")
    for l, source in enumerate(redirects.keys()):
        transitive_target = None
        target = redirects[source]
        seen = {source}
        while True:
            transitive_target = target
            target = redirects.get(target)
            if target is None or target in seen:
                break
            seen.add(target)
        redirects[source] = transitive_target
        if l % 1000000 == 0:
            print("[{}] line: {:08d}".format(datetime.now().isoformat(), l))

    return redirects
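To make the closure step concrete, here is the same loop applied to a hypothetical redirect chain (not taken from the dump): 'A' redirects to 'B' and 'B' to 'C', so after the closure both point directly at 'C'.

# hypothetical redirect chain: A -> B -> C collapses to A -> C and B -> C
demo = {"A": "B", "B": "C"}
for source in demo:
    transitive_target = None
    target = demo[source]
    seen = {source}
    while True:
        transitive_target = target
        target = demo.get(target)
        if target is None or target in seen:
            break
        seen.add(target)
    demo[source] = transitive_target
print(demo)  # {'A': 'C', 'B': 'C'}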
Now extract the adjacency graph as a scipy sparse matrix, resolving the redirects first.
def get_adjacency_matrix(redirects_filename, page_links_filename, limit=None):
    """Extract the adjacency graph as a scipy sparse matrix; redirects are resolved first"""
    print("Computing the redirect map")
    redirects = get_redirects(redirects_filename)

    print("Computing the integer index map")
    index_map = dict()
    links = list()
    for l, line in enumerate(BZ2File(page_links_filename)):
        split = line.split()
        if len(split) != 4:
            print("ignoring malformed line: " + str(line))
            continue
        i = index(redirects, index_map, short_name(split[0]))
        j = index(redirects, index_map, short_name(split[2]))
        links.append((i, j))
        if l % 1000000 == 0:
            print("[{}] line: {:08d}".format(datetime.now().isoformat(), l))

        if limit is not None and l >= limit - 1:
            break

    print("Computing the adjacency matrix")
    X = sparse.lil_matrix((len(index_map), len(index_map)), dtype=np.float32)
    for i, j in links:
        X[i, j] = 1.0
    del links
    print("Converting to CSR representation")
    X = X.tocsr()
    print("CSR conversion done")
    return X, redirects, index_map
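A note on the construction: lil_matrix is convenient for element-wise assignment, but the same matrix could also be built in one pass from the collected (i, j) pairs with a COO matrix, which is usually faster. This is only an alternative sketch, not what the function above does, and it assumes access to the links and index_map variables built inside get_adjacency_matrix:

# alternative one-pass construction; note that duplicate (i, j) links would be
# summed here instead of being overwritten with 1.0 as in the lil_matrix version
rows, cols = zip(*links)
X_alt = sparse.coo_matrix(
    (np.ones(len(links), dtype=np.float32), (rows, cols)),
    shape=(len(index_map), len(index_map)),
).tocsr()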
# stop after 5M links to make it possible to work in RAM
X, redirects, index_map = get_adjacency_matrix(
    redirects_filename, page_links_filename, limit=5000000
)
names = {i: name for name, i in index_map.items()}
Compute the principal singular vectors using the randomized SVD.
print("Computing the principal singular vectors using randomized_svd")
t0 = time()
U, s, V = randomized_svd(X, 5, n_iter=3)
print("done in %0.3f s" % (time() - t0))
# print the names of the Wikipedia pages with the strongest components of the
# principal singular vectors, which should be similar to the highest eigenvector
print("Top wikipedia pages according to principal singular vectors")
pprint([names[i] for i in np.abs(U.T[0]).argsort()[-10:]])
pprint([names[i] for i in np.abs(V[0]).argsort()[-10:]])
Finally, compute the centrality scores with the power iteration method, also known as Google PageRank.
def centrality_scores(X, alpha=0.85, max_iter=100, tol=1e-10):
    """Power iteration computation of the principal eigenvector

    This method is also known as Google PageRank.
    """
    n = X.shape[0]
    X = X.copy()
    incoming_counts = np.asarray(X.sum(axis=1)).ravel()

    print("Normalizing the graph")
    for i in incoming_counts.nonzero()[0]:
        X.data[X.indptr[i]:X.indptr[i + 1]] *= 1.0 / incoming_counts[i]
    # dangling nodes (rows with no out-links) redistribute their score uniformly
    dangle = np.asarray(np.where(np.isclose(X.sum(axis=1), 0), 1.0 / n, 0)).ravel()

    scores = np.full(n, 1.0 / n, dtype=np.float32)  # initial guess
    for i in range(max_iter):
        print("power iteration #%d" % i)
        prev_scores = scores
        scores = (
            alpha * (scores * X + np.dot(dangle, prev_scores))
            + (1 - alpha) * prev_scores.sum() / n
        )

        # check convergence: normalized l_inf norm
        scores_max = np.abs(scores).max()
        if scores_max == 0.0:
            scores_max = 1.0
        err = np.abs(scores - prev_scores).max() / scores_max
        print("error: %0.6f" % err)
        if err < n * tol:
            return scores

    return scores
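Before running it on the full Wikipedia matrix, the function can be sanity-checked on a hypothetical three-node graph (not part of the original data); node 1, which has two incoming links, should come out on top:

# toy graph with edges 0 -> 1, 1 -> 2, 2 -> 0 and 2 -> 1
toy = sparse.csr_matrix(
    (np.ones(4, dtype=np.float32), ([0, 1, 2, 2], [1, 2, 0, 1])), shape=(3, 3)
)
print(centrality_scores(toy, max_iter=50))  # node 1 gets the highest score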
print("Computing principal eigenvector score using a power iteration method")
t0 = time()
scores = centrality_scores(X, max_iter=100)
print("done in %0.3f s" % (time() - t0))
pprint([names[i] for i in np.abs(scores).argsort()[-10:]])