-- Enable automatic conversion of common joins to map joins when one side is small.
set hive.auto.convert.join=true;
-- Max size (bytes) of the small table eligible for map join: ~286 MB here.
set hive.mapjoin.smalltable.filesize=300000000;
-- Allow Hive to convert joins to map joins without a conditional (backup) task.
set hive.auto.convert.join.noconditionaltask=true;
-- Combined size threshold (bytes) for all small tables in a noconditional map join.
set hive.auto.convert.join.noconditionaltask.size=300000000;
-- Build one row per article: all image-URL features concatenated into a single
-- comma-separated string, written into today's partition.
-- Fixes vs. draft: LATERAL VIEW spelling, split() spelling, articleid spelling,
-- required lateral-view table/column aliases, joining on the exploded single URL
-- (not the raw comma-separated image_urls string), qualified GROUP BY, semicolon.
INSERT INTO TABLE ArticleFeatures PARTITION (dt='${etl_dt}')
SELECT t1.articleid AS articleid
      -- one "url:feature" token per distinct image URL, joined with commas
      ,concat_ws(',', collect_set(t2.url_feature)) AS url_features
FROM (
      -- Explode the comma-separated image_urls into one row per (article, url);
      -- DISTINCT up front shrinks the join input.
      SELECT DISTINCT
             articleid
            ,image_url
      FROM Articles
      LATERAL VIEW explode(split(image_urls, ',')) urls AS image_url
     ) t1
LEFT JOIN (
      -- Pre-format each feature row as "url:feature" so the outer query only
      -- needs to collect and concatenate.
      SELECT url
            ,concat_ws(':', url, feature) AS url_feature
      FROM ImageFeatures
     ) t2
  ON t1.image_url = t2.url
GROUP BY t1.articleid;



直接手写的没跑过,肯定有问题(PS:我基本每条 SQL 都要测试几遍,很少会有一遍就通过的,不知道你们是不是一样),但是思路基本都是一样的,就是行转列再拼起来。关键是这个调优怎么做,很想知道一下大神的思路。
个人调优思路:1、提前去重,减少数据量。数据本身较少,直接使用distinct
                          2、在设置中开启mapjoin,把小表直接加载进内存中join
感觉除此之外没啥要优化的了