In Python, how do I get the columns that cause a ValueError in a pandas merge due to a data type mismatch?



I have the Python method below, which receives a list with the names of two CSV files, merges them, and returns a dataframe containing only the differences. In general the method works fine, but I have one case where I get the following error message:

"ValueError: You are trying to merge on float64 and object columns. If you wish to proceed you should use pd.concat"

I would like to catch this error with a try/except block and convert the float64 column to the object dtype. I can catch the error with the code below, but I am not sure how to convert the column, since its name can change depending on the CSV files received. Can anyone help me? How do I get a reference to the column that raised the error inside this try/except block, so I can convert it to object? Or is there a more elegant way to handle this error?

def get_difference(list):
    try:
        df_1 = pd.read_csv(list[0])
        print('column types of df_1')
        print(df_1.dtypes.to_string())
        df_2 = pd.read_csv(list[1])
        print('\n\ncolumn types of df_2')
        print(df_2.dtypes.to_string())
        data_12 = df_1.merge(df_2, indicator=True, how='outer')
        data_12_diff = data_12.loc[lambda x: x['_merge'] != 'both']
        data_12_diff = data_12_diff.drop(columns=['_merge'])
        data_12_diff = data_12_diff.reset_index(drop=True)
        return data_12_diff
    except ValueError:
        print('Value Error Exception. Mismatch of types')
        """
        Additional code here!
        """
        exit()

Thanks in advance! Best regards.

The simplest way to overcome this problem is to enforce the correct data type for each column. If you make sure the data types are correct right from the start of the program, you are much less likely to run into these problems.
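
For example (a minimal sketch; the file names and the dtype mapping below are hypothetical and must match your actual CSV schema), you can force the types at read time with the dtype argument of pandas.read_csv:

import pandas as pd

# Hypothetical schema -- replace with your real column names and types.
expected_dtypes = {'id': 'object', 'amount': 'float64'}

# Both frames now agree on the merge-key dtypes before any merge happens.
df_1 = pd.read_csv('file_1.csv', dtype=expected_dtypes)
df_2 = pd.read_csv('file_2.csv', dtype=expected_dtypes)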

That being said, I created the code below a while ago to solve exactly the problem you are facing. It determines the correct data type for each column and converts to it when needed. It is not 100% fail-proof, but it might solve your problem:


from __future__ import annotations

from collections import abc
from functools import wraps
from typing import Any, Callable, Hashable, List, Sequence, Tuple, Union

import numpy as np
import pandas as pd
from pandas.api.extensions import (
    register_dataframe_accessor,
    register_index_accessor,
    register_series_accessor,
)
from pandas.core.dtypes.common import (  # type: ignore
    is_bool_dtype,
    is_complex_dtype,
    is_datetime64_dtype,
    is_dtype_equal,
    is_float_dtype,
    is_integer_dtype,
    is_timedelta64_dtype,
)
from pandas.core.dtypes.dtypes import ExtensionDtype
from pandas.core.dtypes.inference import iterable_not_string
from pandas.errors import MergeError

try:
    from pandas._typing import DtypeObj  # type: ignore
except ImportError:
    DtypeObj = Union[np.dtype, "ExtensionDtype"]  # type: ignore

# Type aliases used in the signatures below.
IndexLabel = Union[Hashable, Sequence[Hashable]]
Suffixes = Tuple[str, str]

def register_dataframe_method(method: Callable):
    """
    Register a function as a method attached to the ``pandas.DataFrame``.

    This is a wrapper for the ``pandas.core.accessor.AccessorProperty``
    class. **It should be used as a decorator.**

    Parameters
    ----------
    method : callable
        The function to be registered.

    Example
    -------
    .. code-block:: python

        @register_dataframe_method
        def print_column(df, col):
            '''Print the given dataframe column'''
            print(df[col])

        df = pd.DataFrame({'col1': [1, 2, 3], 'col2': [4, 5, 6]})
        df.print_column('col1')
        # [1 2 3]

    See Also
    --------
    Official documentation of pandas accessors:
    `<https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.core.accessor.AccessorProperty.html>`_
    """
    def inner(*args, **kwargs):
        class AccessorMethod(object):
            __doc__ = method.__doc__

            def __init__(self, pandas_obj):
                self._obj = pandas_obj

            @wraps(method)
            def __call__(self, *args, **kwargs):
                return method(self._obj, *args, **kwargs)

        register_dataframe_accessor(method.__name__)(AccessorMethod)
        return method

    return inner()

def register_series_method(method: Callable):
    """Register a function as a method attached to the pandas Series."""
    def inner(*args, **kwargs):
        class AccessorMethod(object):
            __doc__ = method.__doc__

            def __init__(self, pandas_obj):
                self._obj = pandas_obj

            @wraps(method)
            def __call__(self, *args, **kwargs):
                return method(self._obj, *args, **kwargs)

        register_series_accessor(method.__name__)(AccessorMethod)
        return method

    return inner()

def register_index_method(method: Callable):
    """Register a function as a method attached to the pandas Index."""
    def inner(*args, **kwargs):
        class AccessorMethod(object):
            __doc__ = method.__doc__

            def __init__(self, pandas_obj):
                self._obj = pandas_obj

            @wraps(method)
            def __call__(self, *args, **kwargs):
                return method(self._obj, *args, **kwargs)

        register_index_accessor(method.__name__)(AccessorMethod)
        return method

    return inner()

def find_common_type(types: List[DtypeObj]) -> DtypeObj:
    """Find a common data type among the given dtypes.

    Parameters
    ----------
    types : List[DtypeObj]
        A list of possible dtypes that can be considered for conversion.

    Returns
    -------
    DtypeObj
        A pandas extension or numpy dtype.

    Raises
    ------
    ValueError
        If ``types`` is empty.

    See Also
    --------
    numpy.find_common_type : The numpy implementation of this behavior.
    """
    if not types:
        raise ValueError("No types given.")
    first = types[0]
    # Workaround for find_common_type([np.dtype('datetime64[ns]')] * 2)
    # returning object
    if all(is_dtype_equal(first, t) for t in types[1:]):
        return first
    # Get unique types (dict.fromkeys is used as an order-preserving set)
    types = list(dict.fromkeys(types).keys())
    if any(isinstance(t, ExtensionDtype) for t in types):
        for _type in types:
            if isinstance(_type, ExtensionDtype):
                res = _type._get_common_dtype(types)
                if res is not None:
                    return res
        return np.dtype("object")
    # Take the lowest unit
    if all(is_datetime64_dtype(t) for t in types):
        return np.dtype("datetime64[ns]")
    if all(is_timedelta64_dtype(t) for t in types):
        return np.dtype("timedelta64[ns]")
    # Don't mix bool with int, float or complex. This differs from numpy,
    # which casts bools represented by floats or ints to int.
    has_bools = any(is_bool_dtype(t) for t in types)
    if has_bools:
        for t in types:
            if is_integer_dtype(t) or is_float_dtype(t) or is_complex_dtype(t):
                return np.dtype("object")
    return np.find_common_type(types, [])
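
# Illustrative behavior (numeric dtypes follow numpy's promotion rules; note
# that np.find_common_type is deprecated in recent NumPy releases, so this
# module assumes an older NumPy/pandas stack):
#
# >>> find_common_type([np.dtype('int64'), np.dtype('float64')])
# dtype('float64')
# >>> find_common_type([np.dtype('int64'), np.dtype('O')])
# dtype('O')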

def normalize_dtypes(
    xdf: pd.DataFrame, ydf: pd.DataFrame
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Normalize the underlying column dtypes of two input dataframes.

    Function determines which columns exist on both dataframes, and only
    converts those ``pd.Series`` of values to the same dtype.
    :func:`find_common_type` finds the common data type supported by the
    column on both input dataframes.

    Parameters
    ----------
    xdf : pd.DataFrame
        The first dataframe to be updated.
    ydf : pd.DataFrame
        The second dataframe.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The two dataframes with the same data types.
    """
    return _normalize_dtypes(xdf, ydf, [*xdf.columns, *ydf.columns])

def _normalize_dtypes(
    xdf: pd.DataFrame, ydf: pd.DataFrame, use_cols: abc.Iterable
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Normalize the underlying column dtypes of the given dataframes.

    Function finds the columns in common (by column name) between both
    dataframes and the user-provided list of columns, and only normalizes
    the data types of that intersection. In other words, it won't
    necessarily normalize every column whose name matches.

    Parameters
    ----------
    xdf : pd.DataFrame
        The first dataframe to be updated.
    ydf : pd.DataFrame
        The second dataframe.
    use_cols : abc.Iterable
        User-defined list of column names to normalize.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        The two dataframes with the same data types.

    .. versionchanged:: 1.0.3
        Columns that are converted to string now have any trailing '.0'
        removed.
    """
    int_dtypes = ['int8', 'int16', 'int32', 'int64', 'uint8', 'uint16',
                  'uint32', 'uint64']
    float_dtypes = ['float16', 'float32', 'float64']
    string_dtypes = ['string', 'str']
    if not iterable_not_string(use_cols):
        use_cols = [use_cols]
    cols_to_consider = list(set(xdf.columns) & set(use_cols) & set(ydf.columns))
    astype_dict = {}
    for col in cols_to_consider:
        xdtype, ydtype = xdf[col].dtype, ydf[col].dtype
        common_type = find_common_type([xdtype, ydtype])
        if common_type == object:
            # If either side is numeric, prefer float64 over plain object.
            for dtyp in [xdtype, ydtype]:
                if dtyp in [*int_dtypes, *float_dtypes]:
                    common_type = 'float64'
        if common_type == 'float64':
            try:
                xdf[col].astype(common_type)
                ydf[col].astype(common_type)
            except ValueError:
                # Not all values parse as floats: fall back to string.
                common_type = 'str'
        astype_dict[col] = str if common_type == object else common_type
    xdf, ydf = xdf.astype(astype_dict), ydf.astype(astype_dict)
    str_cols = [k for k, v in astype_dict.items() if v in string_dtypes]
    for col in str_cols:
        # Strip the trailing '.0' left over from float -> string conversion.
        xdf[col] = xdf[col].str.replace(r'\.0$', '', regex=True)
        ydf[col] = ydf[col].str.replace(r'\.0$', '', regex=True)
    return xdf, ydf

def maybe_make_list(obj: Any) -> Any | list[Any]:
    """
    Wrap ``obj`` in a ``list``, unless it is ``None`` or already a ``list``
    or ``tuple``.

    Function works similarly to :func:`iterable_not_string`, but instead of
    returning True or False, it returns the object wrapped in a list
    whenever that makes sense.

    Parameters
    ----------
    obj : Any
        The object to be evaluated and possibly converted to a list.

    Returns
    -------
    Any | list[Any]
        The original object, or a list containing the original object.

    Examples
    --------
    >>> x = '12345'
    >>> list(x)
    ['1', '2', '3', '4', '5']
    >>> maybe_make_list(x)
    ['12345']
    >>> maybe_make_list(None)
    >>> maybe_make_list(['1', '2', '3'])
    ['1', '2', '3']
    """
    if obj is not None and not isinstance(obj, (tuple, list)):
        return [obj]
    return obj

@register_dataframe_method
def safe_merge(
    left: pd.DataFrame,
    right: Union[pd.DataFrame, pd.Series],
    how: str = "inner",
    on: IndexLabel | None = None,
    left_on: IndexLabel | None = None,
    right_on: IndexLabel | None = None,
    left_index: bool = False,
    right_index: bool = False,
    sort: bool = False,
    suffixes: Suffixes = ("_x", "_y"),
    copy: bool = True,
    indicator: bool = False,
    validate: str | None = None,
    allow_inflation: bool = True,
) -> pd.DataFrame:
    """Merge two DataFrames, making sure merge keys have the same data type.

    Parameters
    ----------
    left : pd.DataFrame
    right : Union[pd.DataFrame, pd.Series]
    how : str {'left', 'right', 'outer', 'inner', 'cross'}, default 'inner'
        The type of merge to be performed.

        * 'left': use only keys from the left frame, like an SQL left
          outer join; preserve key order.
        * 'right': use only keys from the right frame, like an SQL right
          outer join; preserve key order.
        * 'outer': use union of keys from both frames, like an SQL
          full outer join; sort keys lexicographically.
        * 'inner': use intersection of keys from both frames, like an
          SQL inner join; preserve the order of the left keys.
        * 'cross': create the cartesian product of both frames;
          preserve the order of the left keys.
    on : IndexLabel | None, default None
        Column or index level names to join on. These must be found in both
        DataFrames. If on is None and not merging on indexes then this
        defaults to the intersection of the columns in both DataFrames.
    left_on : IndexLabel | None, default None
        Column or index level names to join on in the left DataFrame. Can
        also be an array or list of arrays of the length of the left
        DataFrame. These arrays are treated as if they are columns.
    right_on : IndexLabel | None, default None
        Column or index level names to join on in the right DataFrame. Can
        also be an array or list of arrays of the length of the right
        DataFrame. These arrays are treated as if they are columns.
    left_index : bool, default False
        Use the index from the left DataFrame as the join key(s). If it is a
        MultiIndex, the number of keys in the other DataFrame (either the
        index or several columns) must match the number of levels.
    right_index : bool, default False
        Use the index from the right DataFrame as the join key. Same caveats
        as left_index.
    sort : bool, default False
        Sort the join keys lexicographically in the result DataFrame. If
        False, the order of the join keys depends on the join type (how
        keyword).
    suffixes : Suffixes, default ("_x", "_y")
        A length-2 sequence where each element is optionally a string
        indicating the suffix to add to overlapping column names in left
        and right respectively. Pass a value of None instead of a string to
        indicate that the column name from left or right should be left
        as-is, with no suffix. At least one of the values must not be None.
    copy : bool, default True
        If False, avoid copy if possible.
    indicator : bool | str, default False
        If True, adds a column to the output DataFrame called "_merge" with
        information on the source of each row. The column can be given a
        different name by providing a string argument. The column will have
        a Categorical type with the value of "left_only" for observations
        whose merge key only appears in the left DataFrame, "right_only" for
        observations whose merge key only appears in the right DataFrame,
        and "both" if the observation's merge key is found in both
        DataFrames.
    validate : str | None, optional
        If specified, checks if merge is of specified type.

        * "one_to_one" or "1:1": check if merge keys are unique in both
          left and right datasets.
        * "one_to_many" or "1:m": check if merge keys are unique in the
          left dataset.
        * "many_to_one" or "m:1": check if merge keys are unique in the
          right dataset.
        * "many_to_many" or "m:m": allowed, but does not result in checks.
    allow_inflation : bool, default True
        If True, the merge is allowed to duplicate ("inflate") rows.
        Otherwise, duplicates are dropped according to the following rules:

        * If how == "left": drop duplicate keys from the :param:`right`
          dataframe.
        * If how == "right": drop duplicate keys from the :param:`left`
          dataframe.

    Returns
    -------
    pd.DataFrame
        A DataFrame of the two merged objects.

    Raises
    ------
    MergeError
        * If :param:`on` and :param:`left_on` or :param:`right_on` are
          defined.
        * If :param:`on` and :param:`left_index`, :param:`right_index` are
          defined.
        * If :param:`left_on` and :param:`right_index` are defined,
          or :param:`right_on` and :param:`left_index` are defined.
    ValueError
        * If :param:`left_on` and :param:`right_on` are defined, but both
          attributes do not contain the same number of columns.

    Examples
    --------
    >>> df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': [20, 30, 40]})
    >>> df2 = pd.DataFrame({'a': ['A0', 'A1', 'A3'], 'b': [20, 50, 40]})
    >>> df1.safe_merge(df2, left_on=['A', 'B'], right_on=['a', 'b'],
    ...                how='inner')
        A   B   a   b
    0  A0  20  A0  20
    >>> df2 = pd.DataFrame({'a': ['A0', 'A1', 'A3'], 'b': ['20', '50', '40']})
    >>> df1.merge(
    ...     df2, left_on=['A', 'B'], right_on=['a', 'b'], how='inner'
    ... )  # doctest: +ELLIPSIS
    Traceback (most recent call last):
    ...
    ValueError: You are trying to merge on int64 and object columns...
    >>> df1 = pd.DataFrame({'a': ['A0', 'A1', 'A2'], 'b': [20, 30, 30]})
    >>> df2 = pd.DataFrame({'A': ['A0', 'A1', 'A3'], 'B': ['20', '30', '40']})
    >>> df1.safe_merge(df2, left_on=['a', 'b'], right_on=['A', 'B'],
    ...                how='inner')
        a   b   A   B
    0  A0  20  A0  20
    1  A1  30  A1  30
    """
    if on is not None:
        on = maybe_make_list(on)
        if any([left_on, right_on]):
            raise MergeError(
                'Can only pass argument "on" OR "left_on" and "right_on", '
                "not a combination of both."
            )
        left_on = right_on = on
    if left_on is not None and right_on is not None:
        left_on, right_on = maybe_make_list(left_on), maybe_make_list(right_on)
        if len(left_on) != len(right_on):
            raise ValueError("len(right_on) must equal len(left_on)")
        if any([left_index, right_index]):
            raise MergeError(
                'Can only pass argument "left_on" and "right_on"'
                ' OR "left_index" and "right_index", not a combination of both.'
            )
        # Temporarily give the right frame the left key names, so the shared
        # merge keys can be cast to a common dtype, then restore the names.
        right = right.rename(columns=dict(zip(right_on, left_on)))
        left, right = normalize_dtypes(left, right)
        right = right.rename(columns=dict(zip(left_on, right_on)))
        if not validate and not allow_inflation:
            validate = get_validation_relationship(how)
        if validate:
            left, right = eval_validation(left, right, left_on, right_on,
                                          validate)
        return left.merge(
            right,
            how=how,  # type: ignore
            left_on=left_on,
            right_on=right_on,
            sort=sort,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            validate=validate,  # type: ignore
        )
    if all([left_index, right_index]):
        return left.merge(
            right,
            how=how,  # type: ignore
            left_index=left_index,
            right_index=right_index,
            sort=sort,
            suffixes=suffixes,
            copy=copy,
            indicator=indicator,
            validate=validate,  # type: ignore
        )
    raise ValueError("Unsupported argument combination")

def get_validation_relationship(how: str) -> str:
    """Return the validation relationship for the given :param:`how`.

    Parameters
    ----------
    how : str {'inner', 'left', 'right'}
        The merge method being used.

    Returns
    -------
    str
        The validation relationship to be used. Available options are:

        * 'many_to_one' : when :param:`how` is 'left'
        * 'one_to_many' : when :param:`how` is 'right'
        * 'one_to_one' : when :param:`how` is 'inner'

    Raises
    ------
    ValueError
        If :param:`how` is not one of 'inner', 'left', 'right'.

    Notes
    -----
    This function does not support 'outer' as a valid :param:`how` value,
    even though ``pd.DataFrame.merge`` does (outer merges are rarely used
    in this project's source code).
    """
    if how == "left":
        return "many_to_one"
    if how == "right":
        return "one_to_many"
    if how == "inner":
        return "one_to_one"
    raise ValueError(f'Unsupported "how" argument {how}')

def eval_validation(
    left: pd.DataFrame,
    right: pd.DataFrame,
    left_on: IndexLabel,
    right_on: IndexLabel,
    validate: str,
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Evaluate the validation relationship between ``left`` and ``right``,
    ensuring that the two dataframes are compatible with the validation
    strategy.

    Use :func:`get_validation_relationship` to get the validation
    relationship, based on the type of merge being performed.

    Parameters
    ----------
    left : pd.DataFrame
        Left dataframe.
    right : pd.DataFrame
        Right dataframe.
    left_on : IndexLabel
        Left merge key label(s).
    right_on : IndexLabel
        Right merge key label(s).
    validate : str {'one_to_one', 'one_to_many', 'many_to_one'}
        Validation strategy.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        Left and right dataframes after validation.

    Raises
    ------
    ValueError
        If the validation strategy is not supported.

    Notes
    -----
    The built-in ``map`` applies a function to each element of an iterable.
    Iterables are objects that implement the ``__iter__`` method; they
    include, for example, lists, tuples, strings, bytes, and bytearrays. You
    can also define your own iterable classes by implementing ``__iter__``.
    """
    # NOTE: plain drop_duplicates is used below; the original snippet called
    # a custom "drop_dup_and_log" helper that is not defined in this excerpt.
    if validate == "many_to_one":
        right = right.drop_duplicates(subset=right_on)
    elif validate == "one_to_many":
        left = left.drop_duplicates(subset=left_on)
    elif validate == "one_to_one":
        # map can be used to apply a function to an iterable (e.g. a list)
        left, right = map(
            lambda args: args[0].drop_duplicates(subset=args[1]),
            tuple(zip([left, right], [left_on, right_on])),
        )
    else:
        raise ValueError(f'Unsupported "validate" argument {validate}')
    return left, right


Example

The code above defines a custom pandas DataFrame method named safe_merge, which you can use the same way as the normal pandas.DataFrame.merge method (in fact, it reuses roughly 80% of the actual pandas.DataFrame.merge source code).

Here is an example of what it can do:

df1 = pd.DataFrame({'A': ['A0', 'A1', 'A2'], 'B': [20, 30, 40]})
df2 = pd.DataFrame({'a': ['A0', 'A1', 'A3'], 'b': ['20', '50', '40']})
df1.safe_merge(df2, left_on=['A', 'B'], right_on=['a', 'b'], how='inner')
# Prints:
#     A     B   a     b
# 0  A0  20.0  A0  20.0
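
Note that B and b come out as 20.0 rather than 20: B is int64 while b holds numeric strings, so _normalize_dtypes promotes both merge keys to float64 (an object column falls back to float64 when all of its values parse as numbers) before merging.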

Here is the same example using the normal pandas.DataFrame.merge:

# Using the normal "merge" operation:
# df1.merge(df2, left_on=['A', 'B'], right_on=['a', 'b'], how='inner')
# ...
# ValueError: You are trying to merge on int64 and object columns. If you wish to proceed you should use pd.concat

Thank you for the detailed answer, Ingwersen_erik. I ended up going with a less complex (and more limited) solution, but it seems to solve my problem. Code below:

def get_difference(list):
    try:
        df_1 = pd.read_csv(list[0])
        df_2 = pd.read_csv(list[1])
        data_12 = df_1.merge(df_2, indicator=True, how='outer')
        data_12_diff = data_12.loc[lambda x: x['_merge'] != 'both']
        data_12_diff = data_12_diff.drop(columns=['_merge'])
        data_12_diff = data_12_diff.reset_index(drop=True)
        return data_12_diff
    except ValueError:
        print('Value Error Exception. Mismatch of column types')
        lista_de_colunas_comuns = common_member(df_1.columns.values,
                                                df_2.columns.values)
        if lista_de_colunas_comuns:
            # Align the dtype of each shared column, then retry the merge.
            for cada_coluna in lista_de_colunas_comuns:
                if df_1[cada_coluna].dtypes != df_2[cada_coluna].dtypes:
                    df_1[[cada_coluna]] = (
                        df_1[[cada_coluna]].astype(df_2[cada_coluna].dtypes)
                    )
            data_12 = df_1.merge(df_2, indicator=True, how='outer')
            data_12_diff = data_12.loc[lambda x: x['_merge'] != 'both']
            data_12_diff = data_12_diff.drop(columns=['_merge'])
            data_12_diff = data_12_diff.reset_index(drop=True)
            return data_12_diff

def common_member(a, b):
    a_set = set(a)
    b_set = set(b)
    if a_set & b_set:
        print(type(a_set & b_set))
        return a_set & b_set
    return set()
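
For reference, here is a quick way to exercise this fix (a minimal sketch; the file names and columns are hypothetical):

import pandas as pd

# Hypothetical CSVs: 'id' reads back as float64 from the first file and as
# object from the second (because of the non-numeric 'x3'), which is exactly
# the mismatch that triggers the ValueError on merge.
pd.DataFrame({'id': [1.0, 2.0], 'val': ['a', 'b']}).to_csv('old.csv', index=False)
pd.DataFrame({'id': ['1.0', 'x3'], 'val': ['a', 'c']}).to_csv('new.csv', index=False)

diff = get_difference(['old.csv', 'new.csv'])
print(diff)  # rows that appear in only one of the two files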