常用spark优化参数

发布时间 2023-03-30 12:43:14作者: DB乐之者

常用spark优化参数

强制使用spark engine

set tqs.query.engine.type = sparkCli;
set spark.yarn.priority = 4;

双写HDFS开启:

set spark.shuffle.hdfs.enable=true;
set spark.shuffle.io.maxRetries=1;
set spark.shuffle.io.retryWait=0s;
set spark.network.timeout=120s;

## 双写HDFS开启避免fetch failed,且基本上只有20min以上大任务再开启

调整全局任务并行度

set spark.sql.shuffle.partitions=400;
set spark.default.paralleism=400;
set spark.executore.cores=4;

动态资源申请

set spark.dynamicAllocation = True;
set spark.dynamicAllocation.minExecutors = 30;
set spark.dynamicAllocation.maxExecutors = 200;
set spark.dynamicAllocation.initExectors = 30;

## 动态资源申请,保证尽快起任务,不适用时归还资源

memory

set spark.exector.memory=10g;
set spark.executor.memoryOverhead=10g;
set spark.driver.memory=3g;

## memory:executor memory = memory + memoryoverhead

join

set spark.shuffle.statistic.verbose=true; -- 收集join数据
set spark.sql.join.perferSortMergejoin=false; -- disable sort to enable hash
set spark.sql.autoBroadcastJoinThreshold=134217728; -- 如果不设置跟autoBroadcastJoinThreshold一致,则被覆盖

AE:skewed

set spark.sql.adaptive.skewedJoin.enable=true;
set spark.sql.adaptive.skewedpartitionMaxSplits=3;
set spark.sql.adaptive.skewedPartitionFactor=3;
set spark.sql.adaptive.skewedPartitionSizeThreshold=52428800;
set spark.sql.adaptive.skewedPartitionRowCountThreshold=5000000;

AE:partition

set spark.sql.adaptive.maxNumPostShufflePartitions=1000;
set spark.sql.adaptive.minNumPostShufflePartitions=10;
set spark.sql.adaptive.shuffle.targetPostShuffleInputSize=60;

## 解决partition太多reducer生成太多文件的问题,自动进行文件合并;

input

set spark.sql.hive.convertMetastoreParquet=true;
set spark.sql.parquet.adaptiveFileSplit=true;
set spark.sql.files.maxPartitionBytes=314572800;
set spark.sql.files.openCostinBytes=16777216;

Output