I'm trying to run sentiment analysis on text using the Twitter API. The problem I'm having is that I don't understand how to separate the text out of each tweet, nor how the sentiment polarity analysis provided by the TextBlob library works. On top of that, I'd like to retrieve only English tweets. The output is in JSON format.
Below is the code that pulls in tweets based on keywords (in this case "usd", "euro", "loonie"), together with my clumsy attempt at storing the text and using the result in a variable:
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json
import re
import pandas as pd
import matplotlib.pyplot as plt

# Variables that contain the user credentials to access the Twitter API
access_token = "xxxx"
access_token_secret = "xxxx"
consumer_key = "xxxx"
consumer_secret = "xxxx"

# This is a basic listener that just prints received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        print data
        return True

    def on_error(self, status):
        print status

if __name__ == '__main__':
    # This handles Twitter authentication and the connection to the Twitter Streaming API
    l = StdOutListener()
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)
    stream = Stream(auth, l)

    # This line filters the Twitter stream to capture data matching the keywords: 'euro', 'dollar', 'loonie'
    stream.filter(track=['euro', 'dollar', 'loonie'])

tweets_data_path = stream.filter
tweets_data = []
tweets_file = open(tweets_data_path, "r")
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue

print len(tweets_data)

tweets['text'] = map(lambda tweet: tweet['text'], tweets_data)
wiki = TextBlob(tweets['text'])
r = wiki.sentiment.polarity
print r
This is what the output looks like:
{"created_at":"Sun Jun 14 23:43:31+0000 2015","id":610231121016524801,"id_str":"6102311210165 24801","text":"RT@amirulimann:RM6 diperlukan utk tukar kpd 1磅。\nRM3 diperlucan utk tukal kpd 1S'美元。\n\nRaf matawang jatuh.Tak sedih ke?htt\u2026","source":"\u003ca href=\"http://twitter.com/download/iphone\"rel=\"nofollow\"\u003eTwitter for iPhone\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to-status_id_str":null、"in_reply_to_user_id":null,"user":{"id":42642877,"id_str":"42642877","name":"Wny","screen_name":"waaannyyy","location":"Dirgahayu Darul Makmur","url":null,"description":"Aku serba tiada,Aku kekurangan。","protected":false,"verified":false,"followers_count":320,"friends_count":239,"listed_count":1,"preferentes_count":4344,"statuses_count":34408,"created_at":"2009年5月26日星期二15:10:28+000","utc_offset":28800,"time_zone":"Kuala Lumpur","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false"profile_background_color":"FFFFFFFFFFFF","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/433201191825047553/PM76m-v2.jpeg","profile_background_tile":true,"profil_link_color":"DD2E44","profile _ sidebar_order_colorhttp://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/609402965795835904/mm6jjRRO_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42642877/1415486321","default_profile":false,"default.profile_image":false、"follow_requestrongent":null、"notifications":null}、"geo":null,"coordinates":null"place":null;"contributors":null。"retweeted_status":{"created_at":"Sat Jun 13 03:33:29+0000 2015"、"id":609564219495706624、"id_str":"609564219496706624"、"text":"RM6需要1英镑。\nRM3需要额外的1美元。\n\nGraf matawang jatuh。德?http://t.co/dum4skb6uK","source":"\u003ca href=\"http://twitter.com/download/android\"rel=\"nofollow\"\u003eTwitter for Android\u003c/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to-status_id_str":null、"in_repli_to_user_id":null;"in_re普利_to_user_id_str":null;"in_reply_to_screen_name":null imann","location":"+06车型年款","url":"http://instagram.com/amirulimannn","description":"我想淹死在她的香水瓶里","protected":false,"verified":false","followers_count":723","friends_count":834","listed_count":2","preferentes_count":4810","statuses_count":50981","created_at":"2012年2月3日星期五07:49:55+00000","utc_offset":28800","time_zone":"Kuala Lumpur","geo_enabled":true","lang":"en","contributors_enabled":false"is _translator":false,"profile_background_color":"AD0A20","profile_bbackground_image_url":"http://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/378800000139426816/61DHBnYy.jpeg","profile_background_tile":false,"profil_link_color":"E36009","profile _ 
sidebar_order_colorhttp://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/592744790283911169/dW7S73WA_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/481856658/1428379855","default_profile":false,"default.profile_image":false、"follow_requestrongent":null、"notifications":null}、"geo":null,"coordinates":null"place":null;"contributors":null。"retweet_count":1321,"favorite_count":229,"entities":{"hashtags":[]、"trends":[],"urls":[];"user_mentations":[]。"symbol":[]"media":[{"id":609564142886760448,"id_str":"609564142886760448","索引":[118140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":"pic.twitter.com/dump4skb6uK","expanded_url":"http://twitter.com/amirulimannn/status/609564219495706624/photo/1","type":"photo","sizes":{"small":{"w":340","h":340,"resize":"fit"},"thumb":{"w":150","h:150","resize":"crop id_str":"609564142886760448","索引":[118140],"media_url":"http://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","media_url_https":"https://pbs.twimg.com/media/CHWbW7yUsAAyAEw.jpg","url":"http://t.co/dum4skb6uK","display_url":">
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream
import json

# Variables that contain the user credentials to access the Twitter API
access_token = ''
access_token_secret = ''
consumer_key = ''
consumer_secret = ''

# This is a basic listener that just prints the text of received tweets to stdout.
class StdOutListener(StreamListener):

    def on_data(self, data):
        json_load = json.loads(data)   # parse the raw JSON string from the stream
        texts = json_load['text']      # keep only the tweet text
        coded = texts.encode('utf-8')  # encode to bytes so no charmap errors occur
        s = str(coded)                 # e.g. "b'some tweet text'"
        print(s[2:-1])                 # strip the surrounding b'...' wrapper
        return True

    def on_error(self, status):
        print(status)

auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
stream = Stream(auth, StdOutListener())

# This line filters the Twitter stream to capture data matching the keywords: 'euro', 'dollar', 'loonie'
stream.filter(track=['euro', 'dollar', 'loonie'], languages=['en'])
For your original question about the JSON: you can load the data stream with json.loads(). The reason for the other steps is so that no charmap errors come up when you pull the data from Twitter into Python. The s[2:-1] is there to get rid of the extra characters left over from encoding to utf-8.
For English-only tweets, you can also filter directly from the stream using languages=['en'].
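To make the encode-then-slice step concrete, here is a small standalone illustration (the tweet text is made up for the example):

text = 'Dollar hits a new high \u2026'  # a str containing a non-ASCII character
coded = text.encode('utf-8')           # bytes: b'Dollar hits a new high \xe2\x80\xa6'
s = str(coded)                         # the string "b'Dollar hits a new high \\xe2\\x80\\xa6'"
print(s[2:-1])                         # prints: Dollar hits a new high \xe2\x80\xa6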
I'm not familiar with the TextBlob library, but you can store the text in any number of ways: simply write the information to a file and read it straight from that file when you run TextBlob. You can replace print(s[2:-1]) with, or add after it:

myFile = open('text.csv', 'a')
myFile.write(s[2:-1])
myFile.write('\n')  # adds a line break between tweets
myFile.close()

You can then use file = open('text.csv', 'r') for your sentiment analysis. Don't forget to call file.close() when you are done with the file.
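As a minimal sketch of that read-back step (assuming TextBlob is installed via pip install textblob and text.csv has been filled by the listener above), TextBlob's sentiment.polarity is a float between -1.0 (negative) and 1.0 (positive):

from textblob import TextBlob

file = open('text.csv', 'r')
for line in file:
    tweet = line.strip()
    if tweet:  # skip blank lines between tweets
        polarity = TextBlob(tweet).sentiment.polarity
        print(polarity, tweet)
file.close()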