Logstash 无法过滤来自 Twitter API 的 JSON 数据



我想删除不必要的字段,这样的字段有很多。我正在使用Logstash的JSON过滤器插件,但它不能正常工作:它要么不过滤数据,要么根本不把数据发送到输出。我也尝试过使用mutate过滤器,但没有成功。例如,我想删除entities字段,它是一个顶级字段,但我尝试过的配置都不起作用。我还想删除一些嵌套字段……

下面是我从Twitter API获得的JSON示例:

{
"retweet_count": 0,
"created_at": "Mon Dec 14 18:43:09 +0000 2020",
"place": null,
"in_reply_to_user_id_str": null,
"lang": "pl",
"filter_level": "low",
"possibly_sensitive": false,
"id": 1338555139993591800,
"id_str": "1338555139993591814",
"quote_count": 0,
"is_quote_status": false,
"geo": null,
"entities": {
"symbols": [],
"user_mentions": [],
"urls": [
{
"indices": [
117,
140
],
"url": "xxx",
"expanded_url": "xxx"
}
],
"hashtags": [
{
"text": "koronawirus",
"indices": [
84,
96
]
},
{
"text": "COVID19",
"indices": [
97,
105
]
},
{
"text": "Lockdown",
"indices": [
106,
115
]
}
]
},
"timestamp_ms": "1607971389183",
"reply_count": 0,
"retweeted": false,
"text": "W Wielkiej Brytanii wykryto nowy wariant koronawirusa. Kolejne kraje z lockdownem­čĹçnn#koronawirus #COVID19 #Lockdownnnxxx",
"contributors": null,
"truncated": false,
"in_reply_to_user_id": null,
"source": "<a href="xxx">Twitter Web App</a>",
"@timestamp": "2020-12-14T18:43:09.000Z",
"in_reply_to_screen_name": null,
"favorited": false,
"in_reply_to_status_id": null,
"user": {
"created_at": "Tue May 12 09:11:01 +0000 2009",
"profile_use_background_image": false,
"lang": null,
"contributors_enabled": false,
"profile_text_color": "000000",
"id": 39464882,
"id_str": "39464882",
"following": null,
"geo_enabled": false,
"profile_sidebar_fill_color": "000000",
"is_translator": false,
"protected": false,
"profile_image_url": "xxx",
"profile_link_color": "3B94D9",
"name": "Salon24.pl",
"profile_sidebar_border_color": "000000",
"favourites_count": 309,
"profile_background_image_url": "xxx",
"followers_count": 17473,
"description": null,
"location": "Polska",
"url": "xxx",
"profile_background_color": "000000",
"utc_offset": null,
"profile_background_image_url_https": "xxx",
"default_profile": false,
"follow_request_sent": null,
"verified": false,
"translator_type": "none",
"friends_count": 1028,
"time_zone": null,
"default_profile_image": false,
"screen_name": "Salon24pl",
"profile_image_url_https": "xxx",
"statuses_count": 48490,
"notifications": null,
"listed_count": 203,
"profile_background_tile": false
},
"in_reply_to_status_id_str": null,
"favorite_count": 0,
"@version": "1",
"coordinates": null
}

这是我的实际配置:

# Pipeline as posted in the question: twitter input -> json filter -> kafka output.
input {
twitter {
id => "logstash_to_kafka_plugin"
consumer_key => "xxx"
consumer_secret => "xxx"
oauth_token => "xxx"
oauth_token_secret => "xxx"
# Only tweets matching this keyword list are requested.
keywords => [ "koronawirus" ]
full_tweet => true
ignore_retweets => true     
}   
}
filter {
# json filter configured to read the "message" field; remove_field is set as an option of this same filter.
# NOTE(review): the sample event shown earlier has "entities" at the top level and no "message" field,
# so "[message][entities]" does not match that event's layout -- confirm what the input actually emits.
# NOTE(review): remove_field is documented to run only when the enclosing filter succeeds -- verify
# that the json filter has anything to parse here.
json {
source => "message"
remove_field => [ "[message][entities]"]
}
}
output {
# Events are serialized with the json codec and published to the "twitter_tweets" topic.
kafka {
codec => json
topic_id => "twitter_tweets"
}
}

我尝试了不同的方式来指示该字段,例如:

remove_field => [ "entities" ] or
remove_field => [ "[entities]" ]

但这也没用。

尝试在json过滤器块之后添加一个带有remove_field的mutate过滤器,这样它会在json过滤器把字段添加到事件根级别之后才执行。你的过滤器配置可能如下所示:

filter {
  # Parse the JSON string held in "message" into top-level event fields.
  json {
    source => "message"
  }
  # A separate mutate filter placed after the json filter, so the removal
  # is evaluated once the parsed fields exist on the event.
  mutate {
    # Top-level fields can be named directly; nested fields must use the
    # bracket field-reference syntax [parent][child] (dotted names such as
    # "user.created_at" do not address a nested field).
    remove_field => ["entities", "[user][created_at]"]
  }
}

最新更新