我收到这个奇怪的错误消息
15/01/26 13:05:12 INFO spark.SparkContext: Created broadcast 0 from wholeTextFiles at NativeMethodAccessorImpl.java:-2
Traceback (most recent call last):
File "/home/user/inverted-index.py", line 78, in <module>
print sc.wholeTextFiles(data_dir).flatMap(update).top(10)#groupByKey().map(store)
File "/home/user/spark2/python/pyspark/rdd.py", line 1045, in top
return self.mapPartitions(topIterator).reduce(merge)
File "/home/user/spark2/python/pyspark/rdd.py", line 715, in reduce
vals = self.mapPartitions(func).collect()
File "/home/user/spark2/python/pyspark/rdd.py", line 676, in collect
bytesInJava = self._jrdd.collect().iterator()
File "/home/user/spark2/python/pyspark/rdd.py", line 2107, in _jrdd
pickled_command = ser.dumps(command)
File "/home/user/spark2/python/pyspark/serializers.py", line 402, in dumps
return cloudpickle.dumps(obj, 2)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 816, in dumps
cp.dump(obj)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 133, in dump
return pickle.Pickler.dump(self, obj)
File "/usr/lib/python2.7/pickle.py", line 224, in dump
self.save(obj)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 562, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 254, in save_function
self.save_function_tuple(obj, [themodule])
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 304, in save_function_tuple
save((code, closure, base_globals))
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/usr/lib/python2.7/pickle.py", line 633, in _batch_appends
save(x)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 254, in save_function
self.save_function_tuple(obj, [themodule])
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 304, in save_function_tuple
save((code, closure, base_globals))
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/usr/lib/python2.7/pickle.py", line 633, in _batch_appends
save(x)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 254, in save_function
self.save_function_tuple(obj, [themodule])
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 304, in save_function_tuple
save((code, closure, base_globals))
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 600, in save_list
self._batch_appends(iter(obj))
File "/usr/lib/python2.7/pickle.py", line 636, in _batch_appends
save(tmp[0])
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 249, in save_function
self.save_function_tuple(obj, modList)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 309, in save_function_tuple
save(f_globals)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 650, in save_reduce
save(state)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 650, in save_reduce
save(state)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 650, in save_reduce
save(state)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 547, in save_inst
self.save_inst_logic(obj)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 537, in save_inst_logic
save(stuff)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 547, in save_inst
self.save_inst_logic(obj)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 537, in save_inst_logic
save(stuff)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 616, in save_reduce
save(cls)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 467, in save_global
d),obj=obj)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 631, in save_reduce
save(args)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/lib/python2.7/pickle.py", line 548, in save_tuple
save(element)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 174, in save_dict
pickle.Pickler.save_dict(self, obj)
File "/usr/lib/python2.7/pickle.py", line 649, in save_dict
self._batch_setitems(obj.iteritems())
File "/usr/lib/python2.7/pickle.py", line 681, in _batch_setitems
save(v)
File "/usr/lib/python2.7/pickle.py", line 331, in save
self.save_reduce(obj=obj, *rv)
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 616, in save_reduce
save(cls)
File "/usr/lib/python2.7/pickle.py", line 286, in save
f(self, obj) # Call unbound method with explicit self
File "/home/user/spark2/python/pyspark/cloudpickle.py", line 442, in save_global
raise pickle.PicklingError("Can't pickle builtin %s" % obj)
pickle.PicklingError: Can't pickle builtin <type 'method_descriptor'>
我的 update 函数返回一个元组列表,每个元组的形式为 (key, (value1, value2)),其中所有元素都是字符串,如下所示:
def update(doc):
    """Re-index one document and return the postings that changed.

    ``doc`` is indexed as ``doc[0]`` (sliced to obtain the document id) and
    ``doc[1]`` (lower-cased and tokenised), i.e. a ``(path, content)`` pair.

    The previously stored term list for the document is read from
    ``fi_table`` and then overwritten with the new one.  The return value is
    a list of ``(term, (action, doc_id))`` tuples where ``action`` is
    ``'add'`` or ``'del'``.

    NOTE(review): ``path_len``, ``ext_len``, ``regex`` and ``fi_table`` are
    module-level names defined outside this block.  Referencing a live
    table/connection object such as ``fi_table`` from a function shipped to
    Spark executors forces it to be pickled with the closure -- confirm it
    is created on the executor side (e.g. per partition) instead.
    """
    doc_id = doc[0][path_len:-ext_len]  # actual file name
    content = doc[1].lower()
    new_fi = regex.split(content)
    old_fi = fi_table.row(doc_id)
    fi_table.put(doc_id, {'cf:col': ",".join(new_fi)})

    # First time this document is seen: every term is an addition.
    if not old_fi:
        return [(term, ('add', doc_id)) for term in new_fi]

    new_terms = set(new_fi)
    old_terms = set(old_fi['cf:col'].split(','))
    # Parenthesised so the concatenation may span two lines; a bare trailing
    # ``+`` at the end of a line is a SyntaxError.
    return (
        [(term, ('add', doc_id)) for term in new_terms - old_terms]
        + [(term, ('del', doc_id)) for term in old_terms - new_terms]
    )
编辑:问题出在这两个 HBase 函数上,即 row 和 put。当我把它们注释掉时,代码可以正常工作(将 old_fi 设置为空字典),但只要其中任何一个被执行,就会产生上述错误。我使用 happybase 在 Python 中操作 HBase。有人可以解释一下是哪里出了问题吗?
Spark 尝试序列化连接对象,以便它可以在执行器内部使用,这肯定会失败,因为反序列化的数据库连接对象无法向另一个范围(甚至计算机)授予读/写权限。通过尝试广播连接对象,可以重现此问题。对于此实例,序列化 I/O 对象时出现问题。
通过连接到映射函数中的数据库,该问题得到了部分解决。由于map函数中的每个RDD元素都有太多连接,我不得不切换到分区处理,以将数据库连接从20k减少到大约8-64(基于分区数)。Spark 开发人员应考虑为执行程序创建一个初始化函数/脚本,以避免此类死胡同问题。
因此,假设我能让每个节点都执行这个 init 函数,那么每个节点都会连接到数据库(通过某种连接池或各自的 zookeeper 节点);由于 init 函数和 map 函数共享同一作用域,问题就会消失,而且这样写出的代码会比我找到的变通方法更快。在执行结束时,Spark 会释放/卸载这些已定义的变量,程序随之结束。
对于 pickle 序列化问题,你可以注册 MethodDescriptorType 的序列化(pickle)方式,如下所示:
def _getattr(objclass, name, repr_str):
# hack to grab the reference directly
try:
attr = repr_str.split("'")[3]
return eval(attr+'.__dict__["'+name+'"]')
except:
attr = getattr(objclass,name)
if name == '__dict__':
attr = attr[name]
return attar
def save_wrapper_descriptor(pickler, obj):
    """Pickle a method/wrapper descriptor by reference.

    Writes a reduce record to ``pickler`` so that ``obj`` is rebuilt by
    calling ``_getattr(obj.__objclass__, obj.__name__, repr(obj))``.

    The ``pickler`` passed in by the dispatch machinery is used as-is: it
    must not be replaced by a fresh ``Pickler``, otherwise the record would
    not end up in the stream currently being written.
    """
    pickler.save_reduce(
        _getattr,
        (obj.__objclass__, obj.__name__, obj.__repr__()),
        obj=obj
    )
# Type object of a method descriptor, obtained from ``type.mro``.
# Register the handler for this type with:
# Pickler.dispatch[MethodDescriptorType] = save_wrapper_descriptor
MethodDescriptorType = type(vars(type)['mro'])
然后,如果将上述内容注册到 Spark 所使用的 pickle 分发表(dispatch table)中(如上所示,或者使用 copy_reg),就有可能绕过这个 pickle 序列化错误。