-- Enable map-side (broadcast) joins so the small ImageFeatures table is
-- loaded into memory instead of going through a shuffle join.
SET hive.auto.convert.join=true;
SET hive.mapjoin.smalltable.filesize=300000000;
SET hive.auto.convert.join.noconditionaltask=true;
SET hive.auto.convert.join.noconditionaltask.size=300000000;

-- Explode each article's comma-separated image_urls into rows, look up the
-- per-URL feature, then collapse back to one "url:feature,url:feature,..."
-- string per article for partition dt='${etl_dt}'.
INSERT INTO TABLE ArticleFeatures PARTITION(dt='${etl_dt}')
SELECT
    t1.articleid AS articleid,
    concat_ws(',', collect_set(t2.url_feature)) AS url_features
FROM (
    -- One row per (article, image url); DISTINCT dedupes up front to shrink
    -- the join input. Fixes from the draft: LATERAL VIEW (not LATERAW) needs
    -- both a view alias and a column alias, the function is split (not
    -- splite), and we must select the exploded image_url, not image_urls.
    SELECT DISTINCT
        articleid AS articleid,
        image_url AS image_url
    FROM Articles
    LATERAL VIEW explode(split(image_urls, ',')) urls AS image_url
) t1
LEFT JOIN (
    -- Pre-build the "url:feature" string so the outer aggregate only has to
    -- collect ready-made values.
    SELECT
        url,
        concat_ws(':', url, feature) AS url_feature
    FROM ImageFeatures
) t2
    ON t1.image_url = t2.url
GROUP BY t1.articleid;
直接手写的没跑过,肯定有问题(PS:我基本每条 SQL 都要测试几遍,很少有一遍就通过的,不知道你们是不是一样),但是思路基本是一样的,就是行转列再拼起来。关键是这个调优怎么做,很想知道一下大神的思路。
个人调优思路:1、提前去重,减少参与 JOIN 的数据量。数据本身较少,直接使用 DISTINCT 即可。
2、在设置中开启 MapJoin,把小表(ImageFeatures)直接加载进内存中完成 JOIN,避免 shuffle。
感觉除此之外没什么可优化的了。