Athena: "Query exhausted resources at this scale factor" – how to optimize the given query



I am running the query below on AWS Athena. It fails with the error 'Query exhausted resources at this scale factor'. In the query stats tab I can see that the data scanned is roughly 36 GB.

WITH session_dataset AS (
SELECT user_id,
max(medium) as medium,
max(event_date) as event_date,
session_id
FROM view_session
where date(event_date) <= date_add('day', - 1, current_date)
and date(event_date) >= date_add('day', - 90, current_date)
and category not in ('Offline Sources')
GROUP BY user_id,
session_id
),
user_conversion AS (
select user_id,
session_id,
name,
event_date,
has_crm,
customer_retention_type
from view_session
where cohort_type = 'conversion'
and name is not null
and date(event_date) <= date_add('day', - 1, current_date)
and date(event_date) >= date_add('day', - 90, current_date)
),
dataset_yesterday AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 1, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_week AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 7, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_month AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 30, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
),
dataset_quarter AS (
SELECT uc.user_id,
uc.name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff(
'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM session_dataset sd,
user_conversion uc
where date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 90, current_date)
and uc.user_id = sd.user_id
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY uc.user_id,
uc.session_id,
uc.name
)
select 'yesterday' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from dataset_yesterday
group by name,
medium_list
union all
select 'month' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from dataset_month
group by name,
medium_list
union all
select 'quarter' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count(
distinct IF(has_crm = '1', user_id, NULL)
) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'returning',
user_id,
NULL
)
) AS returning_customer_count,
count(
distinct IF(
lower(customer_retention_type) = 'new',
user_id,
NULL
)
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from dataset_quarter
group by name,
medium_list

I found similar queries on Stack Overflow. In one post the suggestion was to remove the ORDER BY clause.

How can I do that in the query above?

I would start with the fact that your WHERE clause is not sargable, meaning the date() function wrapped around event_date prevents any index from being used here. And your category NOT IN ('Offline Sources') filter, everything else aside, is killing your performance as well.

That said, you are applying DATE(event_date), which tells me the event_date column is actually a date/time field, and you are looking for events between 1 and 90 days ago. Let me ask: isn't that the same as anything before 12:00 midnight today, i.e. everything through the end of yesterday?

So, if today is December 23rd at 8:47am and you are looking for dates earlier than the current date (December 23rd), the cutoff is already truncated to '2022-12-23 00:00:00'. An event at 11:59pm on December 22nd is therefore still less than the current date. You don't need date(event_date) to get at the 2022-12-22 date part; the value, time included, is already less than December 23rd. So now you have removed one of the non-sargable pieces.
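Just to make that concrete, here is a quick check (assuming, as above, that event_date really is a timestamp column):

-- an event late on 2022-12-22 satisfies both forms of the filter
-- (for any current_date of 2022-12-23 or later), but only the first
-- form leaves event_date unwrapped and index-friendly
select timestamp '2022-12-22 23:59:00' < current_date;                              -- true
select date(timestamp '2022-12-22 23:59:00') <= date_add('day', -1, current_date);  -- true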

Now apply the same reasoning to how far back you go. If you want to look back 90 days, your date should be greater than or equal to '2022-09-24 00:00:00'. An event at 12:01am on September 24th still falls within the scope of the question. With that, you have removed the second non-sargable piece, and an index can be used with a simple adjustment of the WHERE clause to:
where 
event_date >= date_add('day', - 90, current_date)
and event_date < current_date
and category not in ('Offline Sources')

Now, on to other possible optimizations: indexes. I would suggest the following exists:

table         index
view_session  ( event_date, user_id, session_id )

As for the join, try to get away from the 30-year-old SQL style of separating tables with commas. Use the JOIN syntax instead; it helps prevent accidental Cartesian joins and is easier to adjust to a LEFT/RIGHT/OUTER join if a future change calls for it.
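As a small sketch against the two CTEs already defined in your query (just counts, to show the shape of the change):

-- implicit comma join, as in the original query:
SELECT count(*)
FROM session_dataset sd, user_conversion uc
WHERE uc.user_id = sd.user_id;

-- explicit JOIN: the same rows, but it cannot silently become a cross join
-- if the predicate is forgotten, and it is easy to switch to LEFT/RIGHT/FULL later:
SELECT count(*)
FROM session_dataset sd
JOIN user_conversion uc ON uc.user_id = sd.user_id;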

In your "yesterday" query you have a COUNT(session_id), but you also GROUP BY session_id, so each session will always come back as a single record, which tells me something about the intent is off. If you want a count of the sessions, you don't want to group by them, since you are also building an aggregate (array_agg) over them, so I have adjusted that. The same applies to the other derived queries.

I also removed the GROUP BY on name: you are grouping by user_id, which would indicate the name is the same for that user as well. So I take max(uc.name) as name, since it won't change per user_id anyway.
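Put together, the per-user aggregation in each derived query ends up with this shape (a sketch of what the rewritten CTEs below do):

SELECT
    uc.user_id,
    max(uc.name)         AS name,        -- constant per user_id, so max() is safe
    count(sd.session_id) AS view_count   -- count the sessions instead of grouping by them
FROM session_dataset sd
JOIN user_conversion uc ON uc.user_id = sd.user_id
GROUP BY uc.user_id;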

Regarding the SPLIT function on the session: please edit your post and show some sample data from the view_session table. It would make it much easier to see what you are trying to do by 'splitting' the string and then apparently taking the first element of the split via the [ 1 ] subscript.

split(uc.session_id, '_') [ 1 ]

The syntax above does not look like it matches the function-call form, which would instead be

split(uc.session_id, '_', 1)

indicating that, no matter how many pieces there are, you want the first session id out of the string. But that is not proper data normalization either.

I did not change the date() comparisons in the yesterday/week/month/quarter derived queries, because I don't know on what basis the sessions are being compared to the user-conversion event dates. Since no index can be applied to a derived query, there is nothing I can do about sargability there, but at least the pre-query of the data should be optimized.

Finally, your query does a UNION ALL and you are grouping on the name (not the ID) and the medium list. Is that correct? What if you have 100 people all named "John"? Why group by name rather than by ID? I don't know, but regardless, it is your query :)

So, with all of the above, I have revised the query into the final form below, and can edit it based on any issues you run into or feedback you have.

WITH session_dataset AS 
(
SELECT 
user_id,
session_id,
max(medium) as medium,
max(event_date) as event_date
FROM 
view_session
where 
event_date >= date_add('day', - 90, current_date)
and event_date < current_date
and category not in ('Offline Sources')
GROUP BY 
user_id,
session_id
),
user_conversion AS 
(
select 
user_id,
session_id,
name,
event_date,
has_crm,
customer_retention_type
from 
view_session
where 
event_date >= date_add('day', - 90, current_date)
and event_date < current_date
and cohort_type = 'conversion'
and name is not null
),
dataset_yesterday AS 
(
SELECT 
uc.user_id,
max( uc.name ) name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff( 'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg (
sd.medium
order by sd.event_date,
split(sd.session_id, '_') [ 1 ] asc
) as medium_list
FROM 
session_dataset sd
JOIN user_conversion uc
on sd.user_id = uc.user_id
where 
date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 1, current_date)
and split(uc.session_id, '_') [ 1 ] >= split(sd.session_id, '_') [ 1 ]
GROUP BY 
uc.user_id
),
dataset_week AS 
(
SELECT 
uc.user_id,
max( uc.name ) name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff( 'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_', 1 ) asc
) as medium_list
FROM 
session_dataset sd
JOIN user_conversion uc
on sd.user_id = uc.user_id
where 
date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 7, current_date)
and split(uc.session_id, '_', 1 ) >= split(sd.session_id, '_', 1 )
GROUP BY 
uc.user_id
),
dataset_month AS 
(
SELECT 
uc.user_id,
max( uc.name ) name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff( 'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_', 1 ) asc
) as medium_list
FROM 
session_dataset sd
JOIN user_conversion uc
on sd.user_id = uc.user_id
where 
date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 30, current_date)
and split(uc.session_id, '_', 1 ) >= split(sd.session_id, '_', 1 )
GROUP BY 
uc.user_id
),
dataset_quarter AS 
(
SELECT 
uc.user_id,
max( uc.name ) name,
max(uc.has_crm) as has_crm,
max(uc.customer_retention_type) as customer_retention_type,
count(sd.session_id) as view_count,
date_diff( 'day',
date(min(sd.event_date)),
date(max(uc.event_date))
) AS days_convert,
array_agg(
sd.medium
order by sd.event_date,
split(sd.session_id, '_', 1 ) asc
) as medium_list
FROM 
session_dataset sd
JOIN user_conversion uc
ON sd.user_id = uc.user_id
where 
date(sd.event_date) <= date(uc.event_date)
and date(sd.event_date) >= date_add('day', - 90, current_date)
and split(uc.session_id, '_', 1 ) >= split(sd.session_id, '_', 1 )
GROUP BY 
uc.user_id
)
select 
'yesterday' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count( distinct IF(has_crm = '1', user_id, NULL) ) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count( distinct IF( lower(customer_retention_type) = 'returning',
user_id, NULL )
) AS returning_customer_count,
count( distinct IF( lower(customer_retention_type) = 'new',
user_id, NULL )
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from 
dataset_yesterday
group by 
name,
medium_list
union all
select 
'month' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count( distinct IF(has_crm = '1', user_id, NULL) ) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count( distinct IF( lower(customer_retention_type) = 'returning',
user_id, NULL )
) AS returning_customer_count,
count( distinct IF( lower(customer_retention_type) = 'new',
user_id, NULL )
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from 
dataset_month
group by 
name,
medium_list
union all
select 
'quarter' as window,
name,
sum(days_convert) as days_convert,
count(name) as total_conversion,
sum(view_count) as total_view,
count( distinct IF(has_crm = '1', user_id, NULL) ) AS customer_count,
count(distinct IF(has_crm != '1' or has_crm is null, user_id, NULL)) AS anonymous_customer_count,
count( distinct IF( lower(customer_retention_type) = 'returning',
user_id, NULL )
) AS returning_customer_count,
count( distinct IF( lower(customer_retention_type) = 'new',
user_id, NULL )
) AS new_customer_count,
medium_list [ 1 ] as first_click,
medium_list [ cardinality(medium_list) ] as last_click,
medium_list
from 
dataset_quarter
group by 
name,
medium_list

From your comment… I don't know how much control you have over your database, but I would not suggest doing it this way. The parsing and split functions used to pull pieces out of strings may well be the culprit hurting performance. It would be better to have a Session table with the key one-time values, and then a SessionEvent table that carries the ID of the original Session row plus the subsequent individual pieces, such as EventUtcTime, EventAction and whatever else is event-specific.
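Plain DDL just to sketch that layout (the table and column names are only illustrative; in Athena these would typically be external tables over S3):

-- Session: one row per session, holding the one-time key values
CREATE TABLE Session (
    session_id  varchar,
    user_id     varchar,
    medium      varchar,
    category    varchar
);

-- SessionEvent: one row per event, pointing back at its Session row
CREATE TABLE SessionEvent (
    session_id     varchar,    -- id of the original Session row
    EventUtcTime   timestamp,
    EventAction    varchar     -- plus whatever else is event-specific
);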

Then you could easily query things like
select
se.EventAction,
count(*) 
from
SessionEvent se
group by
se.EventAction

and likewise the number of days between dates, events as of a given date, and so on. It is just a suggestion though; I obviously don't have enough context to offer a concrete better alternative.
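For instance, against the hypothetical SessionEvent table sketched above, day counts fall out of a plain aggregate:

select
    se.session_id,
    date_diff('day', min(se.EventUtcTime), max(se.EventUtcTime)) as days_between
from
    SessionEvent se
group by
    se.session_id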
