I installed PyDeequ as follows: in an Anaconda environment, I installed pyspark 3.0.0 together with the latest version of pydeequ and the latest version of sagemaker_pyspark.
from pyspark.sql import SparkSession
import os
os.environ["SPARK_VERSION"] = r"3.0.0"
import pydeequ
import sagemaker_pyspark
from pyspark.sql import Row
from pydeequ.analyzers import *
classpath = ":".join(sagemaker_pyspark.classpath_jars()) # aws-specific jars
spark = (SparkSession
.builder
.config("spark.driver.extraClassPath", classpath)
.config("spark.jars.packages", pydeequ.deequ_maven_coord)
.config("spark.jars.excludes", pydeequ.f2j_maven_coord)
.getOrCreate())
df = spark.read.option("header","true").csv('landing/persistent/chocolate_part_1.csv')
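For completeness, the versions this setup actually resolves to can be double-checked with a small sketch like the one below (output not reproduced here; it only uses attributes already referenced in the setup above):

import os
os.environ["SPARK_VERSION"] = "3.0.0"  # same setting as in the setup above

import pyspark
import pydeequ

print(pyspark.__version__)        # the Spark version actually installed in the environment
print(pydeequ.deequ_maven_coord)  # the deequ artifact pydeequ will ask Spark to fetch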
This works, and the AnalysisRunner class also runs without problems (see the sketch below).
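The analyzer call that succeeds looks roughly like the following sketch, adapted from the pydeequ README (the analyzed column name is a placeholder, since the CSV schema is not shown here):

from pydeequ.analyzers import AnalysisRunner, AnalyzerContext, Size, Completeness

analysisResult = (AnalysisRunner(spark)
                  .onData(df)
                  .addAnalyzer(Size())
                  .addAnalyzer(Completeness("some_column"))  # placeholder column name
                  .run())

# Turn the computed metrics into a DataFrame and display them
AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult).show()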
However, with the following snippet:
from pydeequ.suggestions import *
suggestionResult = ConstraintSuggestionRunner(spark) \
    .onData(df) \
    .addConstraintRule(DEFAULT()) \
    .run()
# Constraint Suggestions in JSON format
print(suggestionResult)
I get the following error:
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
Input In [4], in <cell line: 3>()
1 from pydeequ.suggestions import *
----> 3 suggestionResult = ConstraintSuggestionRunner(spark) \
4 .onData(df) \
5 .addConstraintRule(DEFAULT()) \
6 .run()
8 # Constraint Suggestions in JSON format
9 print(suggestionResult)
File ~/opt/anaconda3/envs/ADSDB/lib/python3.9/site-packages/pydeequ/suggestions.py:81, in ConstraintSuggestionRunBuilder.run(self)
74 def run(self):
75 """
76 A method that runs the desired ConstraintSuggestionRunBuilder functions on the data to obtain a constraint
77 suggestion result. The result is then translated to python.
78
79 :return: A constraint suggestion result
80 """
---> 81 result = self._ConstraintSuggestionRunBuilder.run()
83 jvmSuggestionResult = self._jvm.com.amazon.deequ.suggestions.ConstraintSuggestionResult
84 result_json = json.loads(jvmSuggestionResult.getConstraintSuggestionsAsJson(result))
File ~/opt/anaconda3/envs/ADSDB/lib/python3.9/site-packages/py4j/java_gateway.py:1321, in JavaMember.__call__(self, *args)
1315 command = proto.CALL_COMMAND_NAME +\
1316 self.command_header +\
1317 args_command +\
1318 proto.END_COMMAND_PART
1320 answer = self.gateway_client.send_command(command)
-> 1321 return_value = get_return_value(
1322 answer, self.gateway_client, self.target_id, self.name)
1324 for temp_arg in temp_args:
1325 temp_arg._detach()
File ~/opt/anaconda3/envs/ADSDB/lib/python3.9/site-packages/pyspark/sql/utils.py:190, in capture_sql_exception.<locals>.deco(*a, **kw)
188 def deco(*a: Any, **kw: Any) -> Any:
189 try:
--> 190 return f(*a, **kw)
191 except Py4JJavaError as e:
192 converted = convert_exception(e.java_exception)
File ~/opt/anaconda3/envs/ADSDB/lib/python3.9/site-packages/py4j/protocol.py:326, in get_return_value(answer, gateway_client, target_id, name)
324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client)
325 if answer[1] == REFERENCE_TYPE:
--> 326 raise Py4JJavaError(
327 "An error occurred while calling {0}{1}{2}.n".
328 format(target_id, ".", name), value)
329 else:
330 raise Py4JError(
331 "An error occurred while calling {0}{1}{2}. Trace:n{3}n".
332 format(target_id, ".", name, value))
Py4JJavaError: An error occurred while calling o49.run.
: java.lang.NoSuchMethodError: 'org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression org.apache.spark.sql.catalyst.expressions.aggregate.AggregateFunction.toAggregateExpression(boolean)'
at org.apache.spark.sql.DeequFunctions$.withAggregateFunction(DeequFunctions.scala:31)
at org.apache.spark.sql.DeequFunctions$.stateful_approx_count_distinct(DeequFunctions.scala:60)
at com.amazon.deequ.analyzers.ApproxCountDistinct.aggregationFunctions(ApproxCountDistinct.scala:52)
at com.amazon.deequ.analyzers.runners.AnalysisRunner$.$anonfun$runScanningAnalyzers$3(AnalysisRunner.scala:319)
at scala.collection.immutable.List.flatMap(List.scala:366)
at com.amazon.deequ.analyzers.runners.AnalysisRunner$.liftedTree1$1(AnalysisRunner.scala:319)
at com.amazon.deequ.analyzers.runners.AnalysisRunner$.runScanningAnalyzers(AnalysisRunner.scala:318)
at com.amazon.deequ.analyzers.runners.AnalysisRunner$.doAnalysisRun(AnalysisRunner.scala:167)
at com.amazon.deequ.analyzers.runners.AnalysisRunBuilder.run(AnalysisRunBuilder.scala:110)
at com.amazon.deequ.profiles.ColumnProfiler$.profile(ColumnProfiler.scala:141)
at com.amazon.deequ.profiles.ColumnProfilerRunner.run(ColumnProfilerRunner.scala:72)
at com.amazon.deequ.profiles.ColumnProfilerRunBuilder.run(ColumnProfilerRunBuilder.scala:185)
at com.amazon.deequ.suggestions.ConstraintSuggestionRunner.profileAndSuggest(ConstraintSuggestionRunner.scala:203)
at com.amazon.deequ.suggestions.ConstraintSuggestionRunner.run(ConstraintSuggestionRunner.scala:102)
at com.amazon.deequ.suggestions.ConstraintSuggestionRunBuilder.run(ConstraintSuggestionRunBuilder.scala:226)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.base/java.lang.Thread.run(Thread.java:829)
Any clue as to why this is happening? I am on an M1 MacBook, but my Windows environment shows a similar problem.
Have you tried github.com/canimus/cuallee? It is an alternative to pydeequ that does not require any extra JARs, has been tested on M1, and runs about 2x faster than pydeequ.
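For anyone curious, a minimal sketch of a cuallee check based on its README (the column name is a placeholder and the exact API may vary between versions):

from cuallee import Check, CheckLevel

# Define a simple data-quality check and evaluate it against the Spark DataFrame
check = Check(CheckLevel.WARNING, "chocolate_checks")
check.is_complete("some_column")   # placeholder column name
check.validate(df).show()          # returns a DataFrame of check results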