我的代码在使用 Python 3.6.1 从 JSON 文件读取 Twitter 数据集时遇到错误



这是代码:

import json
import re
# Tweet tokenizer patterns.
#
# All patterns are compiled with re.VERBOSE, so whitespace and "# ..." comments
# inside a pattern are ignored; a literal "#" therefore has to be written "\#".
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
# Alternatives are tried left to right, so the more specific token kinds
# (emoticons, tags, mentions, hash-tags, URLs, numbers) must come before the
# generic word and single-character fallbacks.
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]
# One outer capturing group around the alternation: findall() then returns a
# flat list of matched strings (every inner group is non-capturing).
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
# Anchored variant used to test whether a whole token is an emoticon.
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)
def tokenize(s):
    """Return the list of all matches of ``tokens_re`` found in the string ``s``."""
    matches = tokens_re.findall(s)
    return matches
def preprocess(s, lowercase=False):
    """Tokenize ``s`` and optionally lower-case the resulting tokens.

    Args:
        s: The text to tokenize.
        lowercase: When true, every token is lower-cased except those that
            ``emoticon_re`` matches, which are kept unchanged.

    Returns:
        The list of tokens produced by ``tokenize``.
    """
    result = tokenize(s)
    if not lowercase:
        return result
    normalized = []
    for tok in result:
        if emoticon_re.search(tok):
            # Emoticons keep their case (":D" must not become ":d").
            normalized.append(tok)
        else:
            normalized.append(tok.lower())
    return normalized
# Read the tweet dump (one JSON object per line) and print the tokens of
# every tweet.
with open('mytweets.json', mode='r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        # Blank lines between records are not valid JSON documents;
        # json.loads() would raise json.JSONDecodeError on them, so skip them.
        if not line:
            continue
        tweet = json.loads(line)
        # Tokenize inside the loop so that each tweet is printed, not only
        # the last one read from the file.
        print(preprocess(tweet['text']))

运行代码后出现错误。

问题的解决方案是什么?如何成功读取数据并从 json 格式标记推文?

以下是mytweets.json的一些示例

{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007261674602496,"id_str":"878007261674602496","text":"RT @wreckitroy: Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try tu2026 ","source":"u003ca href="http://twitter.com/download/iphone" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":632645991,"id_str":"632645991","name":"meche","screen_name":"mercedessreyes","location":null,"url":null,"description":"I mean, really it's same me, it's old me u2022 FSU '21 u2022 https://vsco.co/onlymeche","protected":false,"verified":false,"followers_count":1039,"friends_count":352,"listed_count":6,"favourites_count":21860,"statuses_count":21676,"created_at":"Wed Jul 11 04:06:28 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"FCEBB6","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/762423763/6c7d56ca20260816f75c10759208b283.png","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/762423763/6c7d56ca20260816f75c10759208b283.png","profile_background_tile":true,"profile_link_color":"CE7834","profile_sidebar_border_color":"F0A830","profile_sidebar_fill_color":"78C0A8","profile_text_color":"5E412F","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/876886584087502848/9WSQDm8F_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/876886584087502848/9WSQDm8F_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/632645991/1497147929","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null
,"contributors":null,"retweeted_status":{"created_at":"Wed Jun 21 02:57:42 +0000 2017","id":877359845074018304,"id_str":"877359845074018304","text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try tu2026 https://t.co/lUJzY60Sn8","display_text_range":[0,140],"source":"u003ca href="http://twitter.com/download/iphone" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2341390003,"id_str":"2341390003","name":"roy","screen_name":"wreckitroy","location":"Fresno, CA","url":null,"description":"She said I'm looking like a bad man, smooth criminal. ud83cudf43 / snapchat/instagram: thericharrow","protected":false,"verified":false,"followers_count":4831,"friends_count":1103,"listed_count":23,"favourites_count":79829,"statuses_count":1012,"created_at":"Thu Feb 13 04:30:59 +0000 
2014","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/876941549874978816/eTGFmh8u_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/876941549874978816/eTGFmh8u_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/2341390003/1498157548","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zoneud83dude02ud83dude02 https://t.co/i8yFNbGDNn","display_text_range":[0,52],"source":"u003ca href="http://twitter.com/download/iphone" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. 
Bo2 is surperior #JellyFamud83cudf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/874005327045414913/NUPA2rvD_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/874005327045414913/NUPA2rvD_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/844510650/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/i8yFNbGDNn","expanded_url":"https://twitter.com/wreckitroy/status/877210813462740992","display_url":"twitter.com/wreckitroy/stau2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"extended_tweet":{"full_text":"Well, I like dick, so I don't see this as a possibility, but thanks for trying to reach that far up my ass to try to find the truth. 
ud83dude09 https://t.co/fv4Kqvv2sb","display_text_range":[0,134],"entities":{"hashtags":[],"urls":[{"url":"https://t.co/fv4Kqvv2sb","expanded_url":"https://twitter.com/daddygunplay/status/877359034621468672","display_url":"twitter.com/daddygunplay/su2026","indices":[135,158]}],"user_mentions":[],"symbols":[]}},"retweet_count":2496,"favorite_count":12594,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/lUJzY60Sn8","expanded_url":"https://twitter.com/i/web/status/877359845074018304","display_url":"twitter.com/i/web/status/8u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"quoted_status_id":877359034621468672,"quoted_status_id_str":"877359034621468672","quoted_status":{"created_at":"Wed Jun 21 02:54:29 +0000 2017","id":877359034621468672,"id_str":"877359034621468672","text":"When you trying so hard to getvout the friend zoneud83dude02ud83dude02 https://t.co/i8yFNbGDNn","display_text_range":[0,52],"source":"u003ca href="http://twitter.com/download/iphone" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":844510650,"id_str":"844510650","name":"u3164","screen_name":"DaddyGunPlay","location":null,"url":null,"description":"One of the best Contoller players dont @. 
Bo2 is surperior #JellyFamud83cudf47","protected":false,"verified":false,"followers_count":325,"friends_count":276,"listed_count":3,"favourites_count":1795,"statuses_count":5009,"created_at":"Mon Sep 24 23:51:03 +0000 2012","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http://abs.twimg.com/images/themes/theme1/bg.png","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme1/bg.png","profile_background_tile":false,"profile_link_color":"3B94D9","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http://pbs.twimg.com/profile_images/874005327045414913/NUPA2rvD_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/874005327045414913/NUPA2rvD_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/844510650/1496174936","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"quoted_status_id":877210813462740992,"quoted_status_id_str":"877210813462740992","is_quote_status":true,"retweet_count":45,"favorite_count":138,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/i8yFNbGDNn","expanded_url":"https://twitter.com/wreckitroy/status/877210813462740992","display_url":"twitter.com/wreckitroy/stau2026","indices":[53,76]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":true,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"","expanded_url":null,"indices":[133,133]}],"user_mentions":[{"screen_name":"wreckitroy","name":"roy","id":2341390003,"id_str":"2341390003","indices":[3,14]}],"symbol
s":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218426"}
{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007262320754692,"id_str":"878007262320754692","text":"It makes me feel some type of way now bree got another lil boy friend","source":"u003ca href="http://twitter.com/download/iphone" rel="nofollow"u003eTwitter for iPhoneu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":47587983,"id_str":"47587983","name":"Kee Gotti","screen_name":"_BadGalKee","location":"Columbus, OH","url":null,"description":"u2022 Instagram|_badgalkee u2022 SnapChat| kbabiy","protected":false,"verified":false,"followers_count":1107,"friends_count":639,"listed_count":12,"favourites_count":1160,"statuses_count":28359,"created_at":"Tue Jun 16 09:46:12 +0000 2009","utc_offset":-18000,"time_zone":"Quito","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http://abs.twimg.com/images/themes/theme14/bg.gif","profile_background_image_url_https":"https://abs.twimg.com/images/themes/theme14/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/850590447261167616/MuywFrn8_normal.jpg","profile_image_url_https":"https://pbs.twimg.com/profile_images/850590447261167616/MuywFrn8_normal.jpg","profile_banner_url":"https://pbs.twimg.com/profile_banners/47587983/1487216863","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":
false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218580"}
{"created_at":"Thu Jun 22 21:50:18 +0000 2017","id":878007263310393344,"id_str":"878007263310393344","text":"I liked a @YouTube video https://t.co/Znu4govqDi My Friend is in LOVE ...","source":"u003ca href="http://www.google.com/" rel="nofollow"u003eGoogleu003c/au003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":42287518,"id_str":"42287518","name":"David","screen_name":"iceman120","location":"FT LAUDERDALE, FL","url":"http://www.youtube.com/iceman120dl","description":"ue10eue10eOH YOU WANT SOME OF THISue12fue12fue12fue12fue10eue10e","protected":false,"verified":false,"followers_count":4667,"friends_count":361,"listed_count":69,"favourites_count":134,"statuses_count":69716,"created_at":"Sun May 24 21:43:04 +0000 2009","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http://pbs.twimg.com/profile_background_images/53704022/ahamericanflag72.br.jpg","profile_background_image_url_https":"https://pbs.twimg.com/profile_background_images/53704022/ahamericanflag72.br.jpg","profile_background_tile":false,"profile_link_color":"D60000","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"1C1939","profile_text_color":"777777","profile_use_background_image":true,"profile_image_url":"http://pbs.twimg.com/profile_images/511261204120363008/DuNoXOXB_normal.jpeg","profile_image_url_https":"https://pbs.twimg.com/profile_images/511261204120363008/DuNoXOXB_normal.jpeg","profile_banner_url":"https://pbs.twimg.com/profile_banners/42287518/1375147278","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorit
e_count":0,"entities":{"hashtags":[],"urls":[{"url":"https://t.co/Znu4govqDi","expanded_url":"http://youtu.be/up6u1hzWHHc?a","display_url":"youtu.be/up6u1hzWHHc?a","indices":[25,48]}],"user_mentions":[{"screen_name":"YouTube","name":"YouTube","id":10228272,"id_str":"10228272","indices":[10,18]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1498168218816"}

您已经发布了示例,据我所知,您只需要跳过空行。

下面的旧答案

您应该以这种方式解析 json:

...
# Parse the whole file as a single JSON document instead of line by line.
with open('mytweets.json', mode='r', encoding='utf-8') as f:
    tweet = json.load(f)  # json.load() takes a file-like object
    ...

json.load()接受类似文件的对象作为第一个参数。

您当前的做法是逐行读取文件,并尝试将每一行解析为单独的 JSON 字符串;而该文件似乎经过了格式化(带有换行),因此任何一行都不包含完整的 JSON。

你可能想遍历文件中的推文列表(如果我的猜测是正确的),而不是遍历文本行,并在循环中调用 print(preprocess(...))。

最新更新