Class variable dictionary not saved with pickle.dump in Python 2.7



I am using pickle to save an object graph by dumping its root. When I load the root back, it has all of its instance variables and the connected object nodes. However, I also keep every node in a class variable of type dictionary. The class variable is full before saving, but after I unpickle the data it is empty.

This is the class I am using:

from urllib import urlopen
import re
import pickle

class Page():
    __crawled = {}
    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = [] 
    @property
    def relatedURLs(self):
        return self.__relatedURLs
    @property
    def title(self):
        return self.__title
    @property
    def related(self):
        return self.__related
    @property
    def crawled(self):
        return self.__crawled
    def crawl(self,url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle,findPatLink,findPatRelated)
            self.__related.append(newPage)
            self.__crawled[url] = newPage
        else:
            self.__related.append(self.__crawled[url])
    def crawlRelated(self):
        for link in self.__relatedURLs:
            self.crawl(link)

I save it like this:

with open('medTwiceGraph.dat','w') as outf:
    pickle.dump(root,outf)

I load it like this:

def loadGraph(filename): #returns root
    with open(filename,'r') as inf:
        return pickle.load(inf)
root = loadGraph('medTwiceGraph.dat')

All of the data loads except for the class variable __crawled.

What am I doing wrong?

Python doesn't really pickle class objects. It simply saves their names and where to find them. From the pickle documentation:

Similarly, classes are pickled by named reference, so the same restrictions in the unpickling environment apply. Note that none of the class's code or data is pickled, so in the following example the class attribute attr is not restored in the unpickling environment:

class Foo:
    attr = 'a class attr'
picklestring = pickle.dumps(Foo)

These restrictions are why picklable functions and classes must be defined at the top level of a module.

Similarly, when class instances are pickled, their class's code and data are not pickled along with them. Only the instance data are pickled. This is done on purpose, so you can fix bugs in a class or add methods to the class and still load objects that were created with an earlier version of the class. If you plan to have long-lived objects that will see many versions of a class, it may be worthwhile to put a version number in the objects so that suitable conversions can be made by the class's __setstate__() method.
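The quoted behaviour is easy to see in a few lines. Here is a minimal sketch (not from the original post) that pickles an instance, changes the class attribute afterwards, and shows that the original value was never stored:

import pickle

class Foo(object):
    attr = 'a class attr'

data = pickle.dumps(Foo())          # pickle an instance of Foo
Foo.attr = 'changed after dumping'  # mutate the class afterwards
restored = pickle.loads(data)
print(restored.attr)                # prints 'changed after dumping'

The unpickled instance simply looks attr up on whatever Foo looks like at load time; the string 'a class attr' is nowhere in the pickled bytes.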

In your example, you could fix the problem by changing __crawled to be an instance attribute or a global variable.
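For the instance-attribute variant, the change would look roughly like this (a sketch of the suggested fix, not the asker's actual code; note that a per-instance dictionary is no longer shared between Page objects, which is why the asker's own workaround further down moves it into a separate Graph instance instead):

class Page(object):
    def __init__(self, title='', link='', relatedURLs=None):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs if relatedURLs is not None else []
        self.__related = []
        self.__crawled = {}   # instance attribute: lives in self.__dict__, so pickle saves it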

By default, pickle only uses the contents of self.__dict__ and does not use self.__class__.__dict__, which is what you might think you want.

I say "what you think you want", because unpickling an instance should not mutate class-level state.
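You can see the split directly. A small sketch, assuming the Page class from the question:

p = Page('some title', 'http://example.com', [])
print(sorted(p.__dict__))
# ['_Page__link', '_Page__related', '_Page__relatedURLs', '_Page__title']
# no '_Page__crawled' here -- it lives on the class, not on the instance:
print('_Page__crawled' in Page.__dict__)   # True

pickle only walks p.__dict__, so the crawled dictionary never makes it into the file.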

If you want to change this behaviour, look up __getstate__ and __setstate__ in the documentation.
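If you did want the shared dictionary to travel with each pickled instance, a minimal sketch could look like this (hypothetical; the key name _crawled_snapshot is made up, and restoring it in __setstate__ mutates class-level state, which is exactly the caveat above):

class Page(object):
    __crawled = {}

    # ... same __init__, properties and crawl methods as in the question ...

    def __getstate__(self):
        # copy the instance state and tack on a reference to the class-level dict
        state = self.__dict__.copy()
        state['_crawled_snapshot'] = Page.__crawled
        return state

    def __setstate__(self, state):
        # push the saved entries back onto the class, then restore instance state
        Page.__crawled.update(state.pop('_crawled_snapshot', {}))
        self.__dict__.update(state)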

For anyone interested: what I did was create a superclass Graph, which contains the instance variable __crawled, and move my crawling functions into Graph. Page now only contains attributes describing a page and its related pages. I pickle my Graph instance, which contains all of my Page instances. Here is my code.

from urllib import urlopen
#from bs4 import BeautifulSoup
import re
import pickle
###################CLASS GRAPH####################
class Graph(object):
    def __init__(self,roots = [],crawled = {}):
        self.__roots = roots
        self.__crawled = crawled
    @property
    def roots(self):
        return self.__roots
    @property
    def crawled(self):
        return self.__crawled
    def crawl(self,page,url):
        if url not in self.__crawled:
            webpage = urlopen(url).read()
            patFinderTitle = re.compile('<title>(.*)</title>')
            patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
            patFinderRelated = re.compile('<li><a href="([^"]*)"')
            findPatTitle = re.findall(patFinderTitle, webpage)
            findPatLink = re.findall(patFinderLink, webpage)
            findPatRelated = re.findall(patFinderRelated, webpage)
            newPage = Page(findPatTitle,findPatLink,findPatRelated)
            page.related.append(newPage)
            self.__crawled[url] = newPage
        else:
            page.related.append(self.__crawled[url])
    def crawlRelated(self,page):
        for link in page.relatedURLs:
            self.crawl(page,link)
    def crawlAll(self,obj,limit = 2,i = 0):
        print 'number of crawled pages:', len(self.crawled)
        i += 1
        if i > limit:
            return
        else:
            for rel in obj.related:
                print 'crawling', rel.title
                self.crawlRelated(rel)
            for rel2 in obj.related:
                self.crawlAll(rel2,limit,i)          
    def loadGraph(self,filename):
        with open(filename,'r') as inf:
            return pickle.load(inf)
    def saveGraph(self,obj,filename):
        with open(filename,'w') as outf:
            pickle.dump(obj,outf)
###################CLASS PAGE#####################
class Page(Graph):
    def __init__(self, title = '', link = '', relatedURLs = []):
        self.__title = title
        self.__link = link
        self.__relatedURLs = relatedURLs
        self.__related = []      
    @property
    def relatedURLs(self):
        return self.__relatedURLs 
    @property
    def title(self):
        return self.__title
    @property
    def related(self):
        return self.__related
####################### MAIN ######################
def main(seed):
    print 'doing some work...'
    webpage = urlopen(seed).read()
    patFinderTitle = re.compile('<title>(.*)</title>')
    patFinderLink = re.compile('<link rel="canonical" href="([^"]*)" />')
    patFinderRelated = re.compile('<li><a href="([^"]*)"')
    findPatTitle = re.findall(patFinderTitle, webpage)
    findPatLink = re.findall(patFinderLink, webpage)
    findPatRelated = re.findall(patFinderRelated, webpage)
    print 'found the webpage', findPatTitle
    #root = Page(findPatTitle,findPatLink,findPatRelated)
    G = Graph([Page(findPatTitle,findPatLink,findPatRelated)])
    print 'crawling related...'
    G.crawlRelated(G.roots[0])
    G.crawlAll(G.roots[0])  
    print 'now saving...'
    G.saveGraph(G, 'medTwiceGraph.dat')
    print 'done'
    return G
#####################END MAIN######################
#'http://medtwice.com/am-i-pregnant/'
#'medTwiceGraph.dat'
#G = main('http://medtwice.com/menopause-overview/')
#print G.crawled

def loadGraph(filename):
    with open(filename,'r') as inf:
        return pickle.load(inf)
G = loadGraph('medTwiceGraph.dat')
print G.roots[0].title
print G.roots[0].related
print G.crawled
for key in G.crawled:
    print G.crawled[key].title

Using dill solves this problem.
dill package: https://pypi.python.org/pypi/dill
Reference: https://stackoverflow.com/a/28543378/6301132

Applied to the asker's code, it becomes:

import dill

# note: the file must be opened in binary mode
# save
with open('medTwiceGraph.dat','wb') as outf:
    dill.dump(root,outf)
#load
def loadGraph(filename): #returns root
    with open(filename,'rb') as inf:
        return dill.load(inf)
root = loadGraph('medTwiceGraph.dat')

I also wrote another example:

#Another example (with Python 3.x)
import dill
import os
class Employee: 
    def __init__ (self ,name='',contact={}) :
        self.name = name
        self.contact = contact
    def print_self(self):
        print(self.name, self.contact)
#save
def save_employees():
    global emp
    with open('employees.dat','wb') as fh:
        dill.dump(emp,fh)
#load
def load_employees():
    global emp
    if os.path.exists('employees.dat'):
        with open('employees.dat','rb') as fh:
            emp=dill.load(fh)
#---
emp=[]
load_employees()
print('loaded:')
for tmpe in emp:
    tmpe.print_self()
e=Employee() #new employee
if len(emp)==0:
    e.name='Jack'
    e.contact={'phone':'+086-12345678'}
elif len(emp)==1:
    e.name='Jane'
    e.contact={'phone':'+01-15555555','email':'a@b.com'}
else:
    e.name='sb.'
    e.contact={'telegram':'x'}
emp.append(e)
save_employees()
