Pagerank玩具示例无法收敛

我正在编码玩具pagerank，其中也包括爬行者。看起来有些奇怪，因为我的代码无法收敛PR值。我还可以注意，每次迭代之间的三角洲为0，输出的一部分将是：

url: http://en.m.wikipedia.org/wiki/Israel_State_Cup
links_to_node: set(['http://en.m.wikipedia.org/wiki/Association_football', 'http://en.m.wikipedia.org/wiki/Wikipedia:General_disclaimer'])
links_from_node: set(['http://en.m.wikipedia.org/wiki/Israel_State_Cup'])
PR_score: 2.41759524248e+38
ttl_time: 1
last_delta: 0

代码如下：

import requests 
import lxml.html
import random
class pr_node:
        """WDM PR node"""
        url = ""
        links_to_node = set([])
        links_from_node = set([])
        PR_score = 0.0001
        ttl_time = 0
        last_delta = 0
        def __init__(self, url, ttl_time):
            self.url = url
            self.links_to_node = set([])
            self.links_from_node = set([])
            self.PR_score = 0.1
            self.ttl_time = ttl_time
        def print_node_out_links(self):
            print "nn" + self.url + " with ttl " + str(self.ttl_time) + " = "
            s = self.links_to_node
            print "{" + ", ".join(str(e) for e in s) + "}"
        def print_node_pr(self):
            print "nn" + self.url + " PR is: " + str(self.PR_score)
        def print_all(self):
            print "url: " + self.url
            print "links_to_node: " + repr(self.links_to_node)
            print "links_from_node: " + repr(self.links_from_node)
            print "PR_score: " + str(self.PR_score)
            print "ttl_time: " + str(self.ttl_time)
            print "last_delta: " + str(self.last_delta)

def crawl(url, url_ttl):
        """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
        if url_ttl > 0 and (url not in visited_urls):
            # create new node p from parsed page
            print "crawling to " + url + "...n"
            res = requests.get(url)
            doc = lxml.html.fromstring(res.content)
            p = pr_node(url, url_ttl)
            # add new PR node
            global pr_nodes
            pr_nodes[url] = p
            # get all wiki links
            all_links_to_node = set([])
            for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
                add_val = ""
                if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                    add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                    all_links_to_node.add(add_val)
                elif t.attrib['href'].startswith("http://"):
                    add_val = t.attrib['href']
                    all_links_to_node.add(add_val)
                else:
                    pass
            # select random 10 of them and crawl to them
            iter_count = 0
            iter_stop_lim = 10
            while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
                    current_url = random.sample(all_links_to_node, 1)[0]
                    all_links_to_node.remove(current_url)  # don't do it twice...
                    iter_count = + 1
                    if not (current_url in visited_urls) and url_ttl > 1:
                        p.links_to_node.add(current_url)
                        crawl(current_url, url_ttl - 1)
                        visited_urls.add(url)
                    elif current_url in visited_urls and url_ttl == 1:
                        p.links_to_node.add(current_url)
        else:
            print "max depth reached or you've already been here"
        return

def calc_graph_pr(pr_iter_count, damp_factor):
    "print calculating PageRank"
    current_iter = 0
    global pr_nodes
    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node
    g = [g1, g2]
    while current_iter < pr_iter_count:
        print "PageRank iteration #" + str(current_iter)
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1
    pr_nodes = g[0] #WLOG could be also g[1]...
    for p in pr_nodes.itervalues():
        p.print_all()
    print "check bool:"
    print g1 == g2
    return

def update_graph_links():
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(u)
    return
visited_urls = set([])
pr_nodes = {}
glob_pr_iter_count = 50
glob_damp_factor = 0.2
crawl("http://en.m.wikipedia.org/wiki/Nikola_Mitrovic", 3)
update_graph_links()
calc_graph_pr(glob_pr_iter_count, glob_damp_factor)

这是添加函数毁了所有功能。将其修复到：

def update_graph_links():
    """register each node with neighbours pointing at it"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return

进行了几次调整后，一些重构并添加了适当的注释，它提到了以下代码：

import requests 
import lxml.html
import random
import sys
class pr_node:
        """WDM PR node"""
        url = ""
        links_to_node = set([])
        links_from_node = set([])
        PR_score = 0.01
        ttl_time = 0
        last_delta = 0  # used for debug only
        def __init__(self, url, ttl_time):
            """CTOR"""
            self.url = url
            self.links_to_node = set([])
            self.links_from_node = set([])
            self.PR_score = 0.01
            self.ttl_time = ttl_time

        def print_node_out_links(self):
            """print for q1a"""
            print "nn" + self.url + " with ttl " + str(self.ttl_time) + " = "
            s = self.links_to_node
            print "{" + ", ".join(str(e) for e in s) + "}"

        def print_node_pr(self):
            """print for q1b"""
            print "nn" + self.url + " PR is: " + str(self.PR_score)

        def print_all(self):
            """print for q1b and debug"""
            print "url: " + self.url
            print "links_to_node: " + repr(self.links_to_node)
            print "links_from_node: " + repr(self.links_from_node)
            print "PR_score: " + str(self.PR_score)
            print "ttl_time: " + str(self.ttl_time)
            print "last_delta: " + str(self.last_delta)

def crawl(url, url_ttl):
        """crawl to new url, if ttl == 0 max depth reached, don't visit same url twice"""
        if url_ttl > 0 and (url not in visited_urls):
            # create new node p from parsed page
            print "crawling to " + url + "...n"
            res = requests.get(url)
            doc = lxml.html.fromstring(res.content)
            p = pr_node(url, url_ttl)
            # add new PR node
            global pr_nodes
            pr_nodes[url] = p
            # get all wiki links, format to legit URL
            all_links_to_node = set([])
            for t in doc.xpath("//a[contains(@href, '/wiki/')]"):
                add_val = ""
                if not t.attrib['href'].startswith("http://") and t.attrib['href'].startswith("/wiki/"):
                    add_val = "http://en.m.wikipedia.org" + t.attrib['href']
                    all_links_to_node.add(add_val)
                elif t.attrib['href'].startswith("http://"):
                    add_val = t.attrib['href']
                    all_links_to_node.add(add_val)
                else:
                    pass
            # select random 10 of them and crawl to them
            iter_count = 0
            iter_stop_lim = 10
            while iter_count < iter_stop_lim and len(p.links_to_node) < iter_stop_lim and len(all_links_to_node) > 0:
                    # sample random site of linked sites
                    current_url = random.sample(all_links_to_node, 1)[0]
                    # don't sample it twice...
                    all_links_to_node.remove(current_url)
                    iter_count = + 1
                    # crawl if hav'nt been there and TTL enables you to check it
                    if not (current_url in visited_urls) and url_ttl > 1:
                        p.links_to_node.add(current_url)
                        crawl(current_url, url_ttl - 1)
                        visited_urls.add(url)
                    # if reached with TTL == 1 just check links to existing nodes
                    elif current_url in visited_urls and url_ttl == 1:
                        p.links_to_node.add(current_url)
        else:
            print "max depth reached or you've already been here"
        return

def calc_graph_pr(pr_nodes, pr_iter_count, damp_factor):
    """calculate and print the graph's PageRank"""
    current_iter = 0
    # use two graph copies to prevent auto-interference
    g1 = {}
    g2 = {}
    for node in pr_nodes.itervalues():
        g1[node.url] = node
        g2[node.url] = node
    g = [g1, g2]
    # do actual page rank here
    while current_iter < pr_iter_count:
        for p in g[current_iter % 2].itervalues():
            in_links_addition = 0
            # iterate over all pointing nodes and sum their PR/out_link_count
            for l in p.links_to_node:
                l_val = g[(current_iter - 1) % 2][l]
                l_val.delta = l_val.PR_score - g[current_iter % 2][l].PR_score
                in_links_addition += l_val.PR_score/len(l_val.links_from_node)
            # update w.r.t the computed sum and damp_factor
            p.PR_score = damp_factor + (1 - damp_factor) * in_links_addition
        current_iter += 1
    # WLOG could be also g[1]...
    pr_nodes = g[0]
    for p in pr_nodes.itervalues():
        p.print_node_pr()
    return

def update_graph_links():
    """register each node with neighbours pointing at him"""
    global pr_nodes
    for node in pr_nodes.itervalues():
        for u in node.links_to_node:
            if u in pr_nodes:
                pr_nodes[u].links_from_node.add(node.url)
    return

if __name__ == '__main__':
    urlToCrawl = "http://en.m.wikipedia.org/wiki/Nikola_Mitrovic"
    # crawl to the requested site as default
    if len(sys.argv) > 2:
        sys.exit("Unexpected input")
    elif len(sys.argv) == 1:
        pass
    else:
        urlToCrawl = sys.argv[1]
    print_q1a = False
    print_q1b = True
    # set global data structures for crawling and ranking
    visited_urls = set([])
    pr_nodes = {}
    # parameters for PageRank
    glob_pr_iter_count = 100
    glob_damp_factor = 0.2
    # perform crawl in depth 3
    crawl(urlToCrawl, 3)
    if print_q1a:
        for p in pr_nodes.itervalues():
            p.print_node_out_links()
    elif print_q1b:
        # first update the backlinks then start ranking
        update_graph_links()
        calc_graph_pr(pr_nodes, glob_pr_iter_count, glob_damp_factor)
    else:
        pass

相关内容

最新更新

热门标签：