df2 = pd.DataFrame(pd.read_csv("file.csv", delimiter=';', header=None, skiprows=1, engine='python', names=['I', 'II', 'III']))
df2["COMBINED"] = df2["I"].astype(str) + df2["II"].astype(str) + df2["III"].astype(str)
df2 = df2['COMBINED'].replace({'$': '', ',': ''}, regex=True).str.lower()
df2 = [ nltk.word_tokenize( str(COMBINED) ) for COMBINED in df2 ]
a = df2.apply(set)
属性错误:"list"对象没有属性"apply">
df2 = pd.DataFrame(pd.read_csv("file.csv", delimiter=';', header=None, skiprows=1, names=['I', 'II', 'III']))
df2["COMBINED"] = df2["I"].astype(str) + df2["II"].astype(str) + df1["III"].astype(str)
df2["COMBINED"] = df1["COMBINED"].str.replace(r'[^ws]+', '')
df2 = df2.COMBINED.apply(nltk.word_tokenize)
df2 = df2.apply(lambda x: [item.lower() for item in x if item.lower() not in stop_words])
a = df2.apply(set)
属性错误:"系列"对象没有属性"交集">
有人知道如何解决这些问题吗?我想在两个带字符串的数据帧之间生成一个点积,即每一行与另一个df的每一行。
在pd.read_csv()
之后不需要pd.DataFrame()
,pd.read_csv()
的返回类型已经是一个数据帧。
df2 = df2['COMBINED'].replace({'$': '', ',': ''}, regex=True).str.lower()
# ^
# |
# Here df2 is a Series, from the original COMBINED column
df2 = [ nltk.word_tokenize( str(COMBINED) ) for COMBINED in df2 ]
# ^ ^
# | |
# Here df2 is a list Each element in df2 Series
a = df2.apply(set)
列表肯定没有属性apply
。
df2 = pd.read_csv("file.csv", delimiter=';', header=None, skiprows=1, names=['I', 'II', 'III'])
# ^
# |
# df2 has columns I, II, III
df2["COMBINED"] = df2["I"].astype(str) + df2["II"].astype(str) + df1["III"].astype(str)
# ^
# |
# You create a new column combined,
# df2 now has columns I, II, III, COMBINED
df2["COMBINED"] = df1["COMBINED"].str.replace(r'[^ws]+', '')
# ^
# |
# Do operations on COMBINED column
df2 = df2.sentence.apply(nltk.word_tokenize)
# ^
# |
# By using df2.sentence you are accessing sentence column,
# there is no sentence column, only columns I, II, III, COMBINED