我是编程和 NLP 的新手。我在这个网站上找到了一些用于 Twitter 情感分析的代码:https://towardsdatascience.com/creating-the-twitter-sentiment-analysis-program-in-python-with-naive-bayes-classification-672e5589a7ed 。我已经有需要的 CSV 文件,所以没有重新构建它们,而是直接用这些文件来定义变量。
当我尝试运行代码时,运行此行时会给我一个类型错误:
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
并跟踪回到行:
processedTweets.append((self._processTweet(tweet["text"]), tweet["label"]))
我不知道如何绕过问题,并且仍然保持代码的核心功能。
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import twitter
import csv
import time
import nltk
nltk.download('stopwords')
# Load the test and training tweets from CSV into pandas DataFrames.
# NOTE(review): column names are not visible here — the class below expects
# 'text' and 'label' keys per row; confirm the CSVs actually have them.
testDataSet = pd.read_csv("Twitter data.csv")
print(testDataSet[0:4])  # peek at the first four rows
trainingData = pd.read_csv("full-corpus.csv")
print(trainingData[0:4])  # peek at the first four rows
class PreProcessTweets:
    """Clean and tokenize tweets into (token_list, label) training pairs."""

    def __init__(self):
        # English stop words plus punctuation and the placeholder tokens
        # inserted by _processTweet ('AT_USER', 'URL'), so those get
        # filtered out of the final token list as well.
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])

    def processTweets(self, list_of_tweets):
        """Return a list of (cleaned_tokens, label) pairs.

        Accepts either an iterable of dicts with 'text' and 'label' keys,
        or a pandas DataFrame with 'text' and 'label' columns.
        """
        # Iterating a pandas DataFrame yields its COLUMN NAMES (strings),
        # not its rows — that is what caused the reported TypeError.
        # Convert a DataFrame to a list of row dicts before looping.
        if hasattr(list_of_tweets, "to_dict"):
            list_of_tweets = list_of_tweets.to_dict("records")
        processedTweets = []
        for tweet in list_of_tweets:
            processedTweets.append((self._processTweet(tweet["text"]), tweet["label"]))
        return processedTweets

    def _processTweet(self, tweet):
        """Lower-case, strip URLs/usernames/'#', tokenize, drop stop words."""
        tweet = tweet.lower()  # convert text to lower-case
        # The original patterns used '[^s]' and r'1': the backslashes of
        # \s and \1 were lost when the code was copied from the article.
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace @usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)    # keep the word, drop the '#'
        tweet = word_tokenize(tweet)  # split into tokens
        return [word for word in tweet if word not in self._stopwords]
# Build one processor and run it over both DataFrames.
# NOTE(review): the reported TypeError is raised from these calls — the
# class as written iterates the DataFrame's column names, not its rows.
tweetProcessor = PreProcessTweets()
preprocessedTrainingSet = tweetProcessor.processTweets(trainingData)
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
我希望它能先清洗我获取到的数据,之后我才能开始使用朴素贝叶斯分类。
没有您的实际数据很难下结论,但我认为您把几种数据类型混淆了:
- 加载 CSV 数据时,您得到的是一个 pandas DataFrame。
- 然后,在 processTweets 方法中,您试图像遍历列表一样去遍历这个 DataFrame。
- 最后,在 processTweets 的 for 循环里,您把循环变量 "tweet" 当作字典,尝试用键 'text' 和 'label' 去取值;但它实际上并不是字典(直接遍历 DataFrame 得到的是列名字符串)。
我从此网站下载了一些推文。有了这些数据,我测试了您的代码并进行了以下调整。
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from string import punctuation
from nltk.corpus import stopwords
import nltk
#had to install 'punkt'
nltk.download('punkt')
nltk.download('stopwords')
# Load the downloaded tweets; expected to contain 'text' and 'sentiment' columns.
testDataSet = pd.read_csv("data.csv")
# For testing if the code works I only used a TestDatasSet, and no trainingData.
class PreProcessTweets:
    """Clean and tokenize tweets stored in a pandas DataFrame."""

    def __init__(self):
        # English stop words plus punctuation and the placeholder tokens
        # inserted by _processTweet ('AT_USER', 'URL'), so they are
        # filtered out of the final token list too.
        self._stopwords = set(stopwords.words('english') + list(punctuation) + ['AT_USER','URL'])

    # To make it clear I changed the parameter to df_of_tweets (df = dataframe)
    def processTweets(self, df_of_tweets):
        """Return a list of (cleaned_tokens, sentiment) pairs.

        Expects df_of_tweets to be a DataFrame with 'text' and
        'sentiment' columns.
        """
        processedTweets = []
        # Turn the DataFrame columns into plain Python lists so we can
        # iterate rows (iterating the DataFrame itself yields column names).
        # In my data I did not have a label, so I used sentiment instead.
        list_of_tweets = df_of_tweets.text.tolist()
        list_of_sentiment = df_of_tweets.sentiment.tolist()
        # enumerate keeps the row index so text and sentiment stay aligned.
        for index, tweet in enumerate(list_of_tweets):
            processedTweets.append((self._processTweet(tweet), list_of_sentiment[index]))
        return processedTweets

    def _processTweet(self, tweet):
        """Lower-case, strip URLs/usernames/'#', tokenize, drop stop words."""
        tweet = tweet.lower()  # convert text to lower-case
        # The original patterns used '[^s]' and r'1': the backslashes of
        # \s and \1 were lost when the code was copied from the article.
        tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)  # replace URLs
        tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)  # replace @usernames
        tweet = re.sub(r'#([^\s]+)', r'\1', tweet)    # keep the word, drop the '#'
        tweet = word_tokenize(tweet)  # split into tokens
        return [word for word in tweet if word not in self._stopwords]
# Build the processor once and run it over the test DataFrame.
# (The original created a second, never-used PreProcessTweets instance
# after this call; that redundant instantiation has been removed.)
tweetProcessor = PreProcessTweets()
preprocessedTestSet = tweetProcessor.processTweets(testDataSet)
print(preprocessedTestSet)
希望它有帮助!