尝试在cuDF上通过.apply()使用函数,为数据帧中的新列创建值
import cudf
import numpy as np
import pandas as pd
import sys
sys.version
> '3.9.15 | packaged by conda-forge | (main, Nov 22 2022, 08:45:29) \n[GCC 10.4.0]'
# Load the input CSV file into a cuDF DataFrame.
gdf = cudf.read_csv('datav3.csv')
def Rival(row):
    """Flag whether a row's event is one of the listed rival games.

    Args:
        row: A record that supports ``row['event_name']`` lookup (for
            example a DataFrame row handed over by ``apply(..., axis=1)``).

    Returns:
        1 if ``event_name`` equals one of 'HOU21', 'TEN21', 'JAX21',
        'HOU22', 'TEN22' or 'JAX22'; otherwise 0.
    """
    event = row['event_name']
    # Plain `==` comparisons joined with `or` are kept on purpose instead of a
    # set/tuple membership test, so the body stays as simple as the original
    # for a JIT-compiled apply.
    # NOTE(review): confirm which constructs the cuDF/Numba version in use
    # can compile before switching to `in`.
    if (
        event == 'HOU21'
        or event == 'TEN21'
        or event == 'JAX21'
        or event == 'HOU22'
        or event == 'TEN22'
        or event == 'JAX22'
    ):
        return 1
    return 0
# NOTE: this is the call that raises the error quoted below. The lambda refers
# to the module-level function `Rival`, which Numba reports as an untyped
# global ("Cannot determine Numba type of <class 'function'>") when cuDF
# compiles the user-defined function.
gdf['rival'] = gdf.apply(lambda row: Rival(row), axis = 1)
下面是Python输出的错误。代码是在一个受信任(trusted)的Jupyter笔记本中运行的。这个函数在pandas的DataFrame上运行良好,但在cuDF上却出现了问题,我不太确定原因。我使用的RAPIDS版本是rapids 22.12
> ---------------------------------------------------------------------------
> TypingError Traceback (most recent call last)
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/indexed_frame.py:2014, in IndexedFrame._apply(self, func, kernel_getter, *args, **kwargs)
> 2013 try:
> 2014 kernel, retty = _compile_or_get(
> 2015 self, func, args, kernel_getter=kernel_getter
> 2016 )
> 2017 except Exception as e:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
> 78 with self._recreate_cm():
> 79 return func(*args, **kwds)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/udf/utils.py:222, in _compile_or_get(frame, func, args, kernel_getter)
> 219 # precompile the user udf to get the right return type.
> 220 # could be a MaskedType or a scalar type.
> 222 kernel, scalar_return_type = kernel_getter(frame, func, args)
> 223 np_return_type = (
> 224 numpy_support.as_dtype(scalar_return_type)
> 225 if scalar_return_type.is_internal
> 226 else scalar_return_type.np_dtype
> 227 )
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/udf/row_function.py:133, in _get_row_kernel(frame, func, args)
> 130 row_type = _get_frame_row_type(
> 131 np.dtype(list(_all_dtypes_from_frame(frame).items()))
> 132 )
> 133 scalar_return_type = _get_udf_return_type(row_type, func, args)
> 134 # this is the signature for the final full kernel compilation
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
> 78 with self._recreate_cm():
> 79 return func(*args, **kwds)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/udf/utils.py:61, in _get_udf_return_type(argty, func, args)
> 59 # Get the return type. The PTX is also returned by compile_udf, but is not
> 60 # needed here.
> 61 ptx, output_type = cudautils.compile_udf(func, compile_sig)
> 63 if not isinstance(output_type, MaskedType):
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/utils/cudautils.py:250, in compile_udf(udf, type_signature)
> 248 # We haven't compiled a function like this before, so need to fall back to
> 249 # compilation with Numba
> 250 ptx_code, return_type = cuda.compile_ptx_for_current_device(
> 251 udf, type_signature, device=True
> 252 )
> 253 if not isinstance(return_type, cudf.core.udf.masked_typing.MaskedType):
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/cuda/compiler.py:293, in compile_ptx_for_current_device(pyfunc, args, debug, lineinfo, device, fastmath, opt)
> 292 cc = get_current_device().compute_capability
> 293 return compile_ptx(pyfunc, args, debug=debug, lineinfo=lineinfo,
> 294 device=device, fastmath=fastmath, cc=cc, opt=True)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
> 34 with self:
> 35 return func(*args, **kwargs)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/cuda/compiler.py:269, in compile_ptx(pyfunc, args, debug, lineinfo, device, fastmath, cc, opt)
> 262 nvvm_options = {
> 263 'debug': debug,
> 264 'lineinfo': lineinfo,
> 265 'fastmath': fastmath,
> 266 'opt': 3 if opt else 0
> 267 }
> 269 cres = compile_cuda(pyfunc, None, args, debug=debug, lineinfo=lineinfo,
> 270 fastmath=fastmath,
> 271 nvvm_options=nvvm_options)
> 272 resty = cres.signature.return_type
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
> 34 with self:
> 35 return func(*args, **kwargs)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/cuda/compiler.py:212, in compile_cuda(pyfunc, return_type, args, debug, lineinfo, inline, fastmath, nvvm_options)
> 211 with target_override('cuda'):
> 212 cres = compiler.compile_extra(typingctx=typingctx,
> 213 targetctx=targetctx,
> 214 func=pyfunc,
> 215 args=args,
> 216 return_type=return_type,
> 217 flags=flags,
> 218 locals={},
> 219 pipeline_class=CUDACompiler)
> 221 library = cres.library
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler.py:716, in compile_extra(typingctx, targetctx, func, args, return_type, flags, locals, library, pipeline_class)
> 714 pipeline = pipeline_class(typingctx, targetctx, library,
> 715 args, return_type, flags, locals)
> 716 return pipeline.compile_extra(func)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler.py:452, in CompilerBase.compile_extra(self, func)
> 451 self.state.lifted_from = None
> 452 return self._compile_bytecode()
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler.py:520, in CompilerBase._compile_bytecode(self)
> 519 assert self.state.func_ir is None
> 520 return self._compile_core()
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler.py:499, in CompilerBase._compile_core(self)
> 498 if is_final_pipeline:
> 499 raise e
> 500 else:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler.py:486, in CompilerBase._compile_core(self)
> 485 try:
> 486 pm.run(self.state)
> 487 if self.state.cr is not None:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_machinery.py:368, in PassManager.run(self, state)
> 367 patched_exception = self._patch_error(msg, e)
> 368 raise patched_exception
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_machinery.py:356, in PassManager.run(self, state)
> 355 if isinstance(pass_inst, CompilerPass):
> 356 self._runPass(idx, pass_inst, state)
> 357 else:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_lock.py:35, in _CompilerLock.__call__.<locals>._acquire_compile_lock(*args, **kwargs)
> 34 with self:
> 35 return func(*args, **kwargs)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_machinery.py:311, in PassManager._runPass(self, index, pss, internal_state)
> 310 with SimpleTimer() as pass_time:
> 311 mutated |= check(pss.run_pass, internal_state)
> 312 with SimpleTimer() as finalize_time:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/compiler_machinery.py:273, in PassManager._runPass.<locals>.check(func, compiler_state)
> 272 def check(func, compiler_state):
> 273 mangled = func(compiler_state)
> 274 if mangled not in (True, False):
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typed_passes.py:105, in BaseTypeInference.run_pass(self, state)
> 102 with fallback_context(state, 'Function "%s" failed type inference'
> 103 % (state.func_id.func_name,)):
> 104 # Type inference
> 105 typemap, return_type, calltypes, errs = type_inference_stage(
> 106 state.typingctx,
> 107 state.targetctx,
> 108 state.func_ir,
> 109 state.args,
> 110 state.return_type,
> 111 state.locals,
> 112 raise_errors=self._raise_errors)
> 113 state.typemap = typemap
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typed_passes.py:81, in type_inference_stage(typingctx, targetctx, interp, args, return_type, locals, raise_errors)
> 79 infer.seed_type(k, v)
> 81 infer.build_constraint()
> 82 # return errors in case of partial typing
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typeinfer.py:1039, in TypeInferer.build_constraint(self)
> 1038 for inst in blk.body:
> 1039 self.constrain_statement(inst)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typeinfer.py:1386, in TypeInferer.constrain_statement(self, inst)
> 1385 if isinstance(inst, ir.Assign):
> 1386 self.typeof_assign(inst)
> 1387 elif isinstance(inst, ir.SetItem):
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typeinfer.py:1459, in TypeInferer.typeof_assign(self, inst)
> 1458 elif isinstance(value, (ir.Global, ir.FreeVar)):
> 1459 self.typeof_global(inst, inst.target, value)
> 1460 elif isinstance(value, ir.Arg):
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typeinfer.py:1559, in TypeInferer.typeof_global(self, inst, target, gvar)
> 1558 try:
> 1559 typ = self.resolve_value_type(inst, gvar.value)
> 1560 except TypingError as e:
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/numba/core/typeinfer.py:1480, in TypeInferer.resolve_value_type(self, inst, val)
> 1479 msg = str(e)
> 1480 raise TypingError(msg, loc=inst.loc)
>
> TypingError: Failed in cuda mode pipeline (step: nopython frontend)
> Untyped global name 'Rival': Cannot determine Numba type of <class 'function'>
>
> File "<timed exec>", line 1:
> <source missing, REPL/exec in use?>
>
>
> The above exception was the direct cause of the following exception:
>
> ValueError Traceback (most recent call last)
> File <timed exec>:1
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
> 76 @wraps(func)
> 77 def inner(*args, **kwds):
> 78 with self._recreate_cm():
> 79 return func(*args, **kwds)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/dataframe.py:4369, in DataFrame.apply(self, func, axis, raw, result_type, args, **kwargs)
> 4366 if result_type is not None:
> 4367 raise ValueError("The `result_type` kwarg is not yet supported.")
> 4369 return self._apply(func, _get_row_kernel, *args, **kwargs)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/contextlib.py:79, in ContextDecorator.__call__.<locals>.inner(*args, **kwds)
> 76 @wraps(func)
> 77 def inner(*args, **kwds):
> 78 with self._recreate_cm():
> 79 return func(*args, **kwds)
>
> File ~/miniconda3/envs/rapids-22.12/lib/python3.9/site-packages/cudf/core/indexed_frame.py:2018, in IndexedFrame._apply(self, func, kernel_getter, *args, **kwargs)
> 2014 kernel, retty = _compile_or_get(
> 2015 self, func, args, kernel_getter=kernel_getter
> 2016 )
> 2017 except Exception as e:
> 2018 raise ValueError(
> 2019 "user defined function compilation failed."
> 2020 ) from e
> 2022 # Mask and data column preallocated
> 2023 ans_col = _return_arr_from_dtype(retty, len(self))
>
> ValueError: user defined function compilation failed.
在RAPIDS cuDF中,有一种能更好地利用GPU并行能力的方法,而且在Pandas API中同样适用:使用.loc来完成。
由于我无法直接访问您的数据源,我将使用https://www.geeksforgeeks.org/上的NBA数据集。为了便于遇到同样问题的其他人理解,我只演示一个例子。您可以按同样的方式重复操作,或者适当地组合过滤条件。请相应地调整您的代码
import cudf
import numpy as np
import pandas as pd
import sys
# Read the public NBA sample data set into a cuDF DataFrame.
gdf = cudf.read_csv('https://media.geeksforgeeks.org/wp-content/uploads/nba.csv')
print(gdf.head())  # preview of the data, for reference

# Create the new column, event_name, with every row initialised to 0.
gdf["event_name"] = 0

# Boolean row masks on the Team column: Atlanta Hawks rows and all the others.
is_hawks = gdf["Team"] == "Atlanta Hawks"
not_hawks = gdf["Team"] != "Atlanta Hawks"

# Set event_name to 1 on every row where Team == Atlanta Hawks.
gdf.loc[is_hawks, "event_name"] = 1

print(gdf.loc[not_hawks])  # event_name stays 0 for those teams
print(gdf.loc[is_hawks])  # event_name is 1 for the Hawks
您可以把库换成Pandas,会发现同样的代码在那里也能运行。即使在Pandas中,当没有现成的API能完成所需任务时,.apply()也非常好用。不过,直接使用内置API始终是最快、最好的方法!:) 解决这个问题还有其他办法,但这是我今天为您准备的方案。