我尝试过这个命令:
# Cleaning text (RT markers, mentions, punctuation, links)
# Creating new dataframe and new features.
# NOTE(review): assumes tweet_list is a flat sequence of tweet strings, so
# that column 0 of the resulting frame holds the raw text -- confirm upstream.
tw_list = pd.DataFrame(tweet_list)
tw_list["text"] = tw_list[0]

# Removing RT, punctuation etc.
# Raw strings are used so the regex escapes (\w, \S, \t) reach `re` intact.
# remove_rt: replace a "RT @user: " retweet marker with a single space.
remove_rt = lambda x: re.sub(r'RT @\w+: ', " ", x)
# rt: replace with a single space any of
#   1. an @mention made of ASCII letters/digits,
#   2. one character that is NOT an ASCII letter, digit, space or tab,
#   3. a URL of the form scheme://non-whitespace.
# The negated class must be spelled "[^0-9...]" with a caret and an ASCII
# zero; a superscript zero or degree sign in that position turns it into a
# range whose start sorts after "9", and re raises "bad character range".
# Likewise "0-9" needs an ASCII hyphen, not an en dash, to be a digit range.
rt = lambda x: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", x)
tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
tw_list["text"] = tw_list.text.str.lower()
# Last expression in the notebook cell: displays the first 10 cleaned rows.
tw_list.head(10)
我得到了一个错误,回溯信息(Traceback (most recent call last))如下:
<ipython-input-15-e640b99d08dd> in <module>
8 remove_rt = lambda x: re.sub('RT @w+: '," ",x)
9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z t])|(w+://S+)"," ",x)
---> 10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
11 tw_list["text"] = tw_list.text.str.lower()
12 tw_list.head(10)
c:program filespython39libsite-packagespandascoreseries.py in map(self, arg, na_action)
3907 dtype: object
3908 """
-> 3909 new_values = super()._map_values(arg, na_action=na_action)
3910 return self.constructor(new_values, index=self.index).finalize_(
3911 self, method="map"
c:program filespython39libsite-packagespandascorebase.py in _map_values(self, mapper, na_action)
935
936 # mapper is a function
--> 937 new_values = map_f(values, mapper)
938
939 return new_values
pandas_libslib.pyx in pandas._libs.lib.map_infer()
<ipython-input-15-e640b99d08dd> in <lambda>(x)
7 #Removing RT, Punctuation etc
8 remove_rt = lambda x: re.sub('RT @w+: '," ",x)
----> 9 rt = lambda x: re.sub("(@[A-Za-z0–9]+)|([⁰-9A-Za-z t])|(w+://S+)"," ",x)
10 tw_list["text"] = tw_list.text.map(remove_rt).map(rt)
11 tw_list["text"] = tw_list.text.str.lower()
c:program filespython39libre.py in sub(pattern, repl, string, count, flags)
208 a callable, it's passed the Match object and must return
209 a replacement string to be used."""
--> 210 return _compile(pattern, flags).sub(repl, string, count)
211
212 def subn(pattern, repl, string, count=0, flags=0):
c:program filespython39libre.py in _compile(pattern, flags)
302 if not sre_compile.isstring(pattern):
303 raise TypeError("first argument must be string or compiled pattern")
--> 304 p = sre_compile.compile(pattern, flags)
305 if not (flags & DEBUG):
306 if len(_cache) >= _MAXCACHE:
c:program filespython39libsre_compile.py in compile(p, flags)
762 if isstring(p):
763 pattern = p
--> 764 p = sre_parse.parse(p, flags)
765 else:
766 pattern = None
c:program filespython39libsre_parse.py in parse(str, flags, state)
946
947 try:
--> 948 p = _parse_sub(source, state, flags & SRE_FLAG_VERBOSE, 0)
949 except Verbose:
950 # the VERBOSE flag was switched on inside the pattern. to be
c:program filespython39libsre_parse.py in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
--> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch("|"):
c:program filespython39libsre_parse.py in _parse(source, state, verbose, nested, first)
832 sub_verbose = ((verbose or (add_flags & SRE_FLAG_VERBOSE)) and
833 not (del_flags & SRE_FLAG_VERBOSE))
--> 834 p = _parse_sub(source, state, sub_verbose, nested + 1)
835 if not source.match(")"):
836 raise source.error("missing ), unterminated subpattern",
c:program filespython39libsre_parse.py in _parse_sub(source, state, verbose, nested)
441 start = source.tell()
442 while True:
--> 443 itemsappend(_parse(source, state, verbose, nested + 1,
444 not nested and not items))
445 if not sourcematch("|"):
c:program filespython39libsre_parse.py in _parse(source, state, verbose, nested, first)
596 if hi < lo:
597 msg = "bad character range %s-%s" % (this, that)
--> 598 raise source.error(msg, len(this) + 1 + len(that))
599 setappend((RANGE, (lo, hi)))
600 else:
错误信息为:error: bad character range ⁰-9 at position 18(即位置 18 处的字符范围 ⁰-9 无效)。
我正在尝试创建新的数据帧(tw_list)和新的特征列(text),然后使用 lambda 函数清除 RT、链接和标点符号,并将文本转换为小写。
正则表达式中的问题出在这一部分:"...[°-9]"。如果你确实想匹配字符"°"和"-",需要对"-"进行转义(写成"\-"),以避免正则表达式把它解析为范围操作。
如果它本来应该是"0"而不是"°",直接把"°"字符替换为"0"即可。
您可以在此处查看您的问题:https://regex101.com/r/hhf27i/1
以下是解决方法:https://regex101.com/r/8d1VxP/1