BigQuery:使用_TABLE_SUFFIX在start_date列前28天的窗口内迭代组 &g



我得到了这样一个表:

<表类> group_id start_date end_date tbody><<tr>193352022061320220714195272022062020220719193392022061420220720194362022061620220715200952022071120220809

我能想到两种方法。

  1. 连接所有表,然后执行查询。
  2. 为每个用户创建动态查询

两种方法都需要search_fromsearch_to事先可用,即你需要在做任何事情之前计算每个用户的搜索范围。
,

WITH users_per_group AS (
SELECT 
user_id, group_id
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 4 DAY)search_from
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 1 DAY)search_to
FROM TableName
) 

一旦你有了这样的表,你就可以使用上述任何一种方法。

因为我没有你的数据,不知道你的表名,我给了一个例子,使用公共数据集方式1。

-- consider this your main table which contains user,grp,start_date,end_date
with  maintable as (
select 'India' visit_from,  '20161115' as start_date, '20161202' end_date
union all select 'Sweden'  , '20161201', '20161202' 
),
--then calculate  search from-to date for every user and group
user_per_grp as( 
select *, DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 4 DAY)search_from  --change interval as per your need
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 1 DAY)search_to
from maintable 
)
select visit_from,_TABLE_SUFFIX date,count(visitId) total_visits  from
user_per_grp ug 
left join `bigquery-public-data.google_analytics_sample.ga_sessions_*` as pub on pub.geoNetwork.country = ug.visit_from
where _TABLE_SUFFIX between format_date("%Y%m%d",ug.search_from) and format_date("%Y%m%d",ug.search_to)
group by 1,2

方法2

declare queries array<string> default [];
create temp table  maintable as (
select 'India' visit_from,  '20161115' as start_date, '20161202' end_date
union all select 'Sweden'  , '20161201', '20161202' 
);
create temp table user_per_grp as( 
select *, DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 4 DAY)search_from
,DATE_SUB(parse_date("%Y%m%d", start_date), INTERVAL 1 DAY)search_to
from maintable 
);
-- for each user create a seperate query here 
FOR record IN (SELECT * from user_per_grp)
DO
set queries =  queries || [format('select "%s" Visit_From,_TABLE_SUFFIX Date,count(visitId) total_visits  from `bigquery-public-data.google_analytics_sample.ga_sessions_*` where _TABLE_SUFFIX between format_date("%%Y%%m%%d","%t") and format_date("%%Y%%m%%d","%t") and geoNetwork.country="%s" group by 1,2',record.visit_from,record.search_from,record.search_to,record.visit_from)];
--replace your query here. 
END FOR;
--aggregating all the queries and executing it
execute immediate (select string_agg(query, ' union all ') from unnest(queries) query);

这里,第二种方法处理的数据(~750 KB)比第一种方法(~17 MB)少得多。但对于你的数据集来说,这可能不一样,因为两个用户的日期范围可能重叠,这将导致读取同一个表两次。

相关内容

  • 没有找到相关文章

最新更新