我写了下面的代码。我的句子是Twitter的一部分。我想从列表中删除所有表情符号,但我的表情符号功能不起作用。为什么?
我还想删除用户。用户从句子的开头开始,但有时它会保留用户,有时它会删除用户。我的标点符号也不起作用,我评论了它。我该如何解决这个问题?
import re

import spacy
from nltk.corpus import stopwords  # `stopwords` was used below but never imported

# 'en' is a removed shortcut in spaCy v3; the full model name works on v2 and v3.
nlp = spacy.load('en_core_web_sm')
# With no argument NLTK returns stop words for *all* languages; the tweets are
# English, so restrict the list accordingly.
stop_words = [w.lower() for w in stopwords.words('english')]
def sanitize(input_string):
    """Sanitize one tweet.

    Strips graphical emoji, a leading @user mention, '#'/'@'/quote characters,
    t.co links, stop words and remaining punctuation; lowercases everything.

    Parameters
    ----------
    input_string : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, space-joined tokens ('' if nothing survives).
    """
    # Remove graphical emoji.  The escapes must be \U0001F600 etc. — without
    # the backslash the class matched the literal characters "U0001F600-…".
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        "]+",
        flags=re.UNICODE)
    string = emoji_pattern.sub('', input_string)
    # BUG FIX: lowercase the emoji-free text; the original lowercased
    # `input_string`, silently discarding the emoji removal above.
    string = string.lower()
    # Remove a leading @user *before* tokenizing: spaCy may split "@name"
    # into two tokens ("@", "name"), in which case deleting only the first
    # token left the user name behind.
    string = re.sub(r'^@\w+\s*', '', string)
    # spaCy tokenizer
    string_split = [token.text for token in nlp(string)]
    # In case the string is empty
    if not string_split:
        return ''
    # Join back to string
    string = ' '.join(string_split)
    # Remove # and @
    for punc in '":!@#':
        string = string.replace(punc, '')
    # Remove 't.co/' links.  The ':' was already stripped by the loop above,
    # hence "http//"; '.' and the whitespace class need their backslashes.
    string = re.sub(r'http//t\.co/[^\s]+', '', string, flags=re.MULTILINE)
    # Removing stop words
    string = ' '.join([w for w in string.split() if w not in stop_words])
    # Punctuation: working replacement for the commented-out attempt, which
    # confused the local variable `string` with the stdlib `string` module.
    # (string.punctuation spelled out to avoid that shadowing.)
    punctuation = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    string = string.translate(str.maketrans('', '', punctuation))
    # BUG FIX: the original `return` was commented out, so the function
    # always returned None for non-empty input.
    return string
# Sample tweets.  The original commented out only the opening line with '#',
# leaving the continuation lines as a SyntaxError; it also named the variable
# `list`, shadowing the builtin.
tweets = ['@cosmetic_candy I think a lot of people just enjoy being a pain in the ass on there',
          'Best get ready sunbed and dinner with nana today :)',
          '@hardlyin70 thats awesome!',
          'Loving this weather',
          '“@danny_boy_37: Just seen an absolute idiot in shorts! Be serious!” Desperado gentleman',
          '@SamanthaOrmerod trying to resist a hardcore rave haha! Resisting towns a doddle! Posh dance floor should wear them in quite easy xx',
          '59 days until @Beyonce!!! Wooo @jfracassini #cannotwait',
          'That was the dumbest tweet I ever seen',
          'Oh what to do on this fine sunny day?',
          '@Brooke_C_X hows the fish ? Hope they r ok. Xx',
          '@Jbowe_ 😠',
          'Or this @louise_munchi',
          '@guy_clifton your diary is undoubtedly busier than mine, but feel free to check ',
          'Willy🐟👌⚽']
list_sanitized = [sanitize(t) for t in tweets]
# REPL-style inspection of the results; harmless no-op when run as a script.
list_sanitized[:50]
我在这里借鉴了其他一些 SO 答案:
- 删除文本表情符号:https://stackoverflow.com/a/61758471/42346
- 删除图形表情符号:https://stackoverflow.com/a/50602709/42346
这也将删除字符串中出现的任何Twitter用户名。
import re  # used by sanitize() below but missing from the original imports

import emoji
import spacy
import stop_words

nlp = spacy.load('en_core_web_sm')
# Lowercase the stop-word list once so membership tests match the
# lowercased tokens produced during sanitizing.
stopwords = [w.lower() for w in stop_words.get_stop_words('en')]
# Textual (ASCII-art) emoticon pattern, e.g. ":)", ":-D", "D:".
# The backslashes had been stripped from this pattern (the "mouth" character
# class was left unterminated, so re raised on compile).  The inline (?x)
# flag makes the pattern verbose-mode on its own, so callers can use it with
# a plain re.sub and no extra flags.
emoticon_string = r"""(?x)
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{@\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""
def give_emoji_free_text(text):
    """Return *text* with all graphical (Unicode) emoji removed."""
    try:
        # emoji >= 2.0: get_emoji_regexp() was removed from the package.
        return emoji.replace_emoji(text, replace='')
    except AttributeError:
        # emoji < 2.0 fallback.
        return emoji.get_emoji_regexp().sub(r'', text)
def sanitize(string):
    """Sanitize one tweet.

    Strips graphical and textual emoji, @user mentions (anywhere in the
    tweet), '#'/'@'/quote punctuation, t.co links and stop words;
    lowercases everything.

    Parameters
    ----------
    string : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, space-joined tokens ('' if nothing survives).
    """
    # remove graphical emoji
    string = give_emoji_free_text(string)
    # remove textual emoji (e.g. ":)", ":-D")
    string = re.sub(emoticon_string, '', string)
    # normalize to lowercase
    string = string.lower()
    # spaCy tokenizer
    string_split = [token.text for token in nlp(string)]
    # in case the string is empty
    if not string_split:
        return ''
    # join back to string
    string = ' '.join(string_split)
    # remove any @user mention.  \w needs its backslash — the original
    # class [w_] matched only the literal characters 'w' and '_'.
    string = re.sub(r'@[\w_]+', '', string)
    # remove # and @
    for punc in '":!@#':
        string = string.replace(punc, '')
    # remove 't.co/' links.  The ':' was stripped by the loop above, hence
    # "http//"; '.' and the whitespace class need their backslashes.
    string = re.sub(r'http//t\.co/[^\s]+', '', string, flags=re.MULTILINE)
    # removing stop words
    string = ' '.join(w for w in string.split() if w not in stopwords)
    return string