Common PySpark Transform Operators

Published 2023-10-16 17:19:03 · Author: whiteY
from pyspark import SparkConf, SparkContext

conf = SparkConf().setAppName("test_SamShare").setMaster("local[4]")
sc = SparkContext(conf=conf)

# 1. map: apply a function to every element
rdd = sc.parallelize(range(1, 11), 4)
rdd_map = rdd.map(lambda x: x * 2)
print("rdd", rdd.collect())
print("rdd_map", rdd_map.collect())

# rdd [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# rdd_map [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]

# 2. flatMap: map each element and flatten the results into a single list
rdd = sc.parallelize(["hello hadoop", "hello pyspark"])
print("原始数据", rdd.collect())
print("map", rdd.map(lambda x: x.split(" ")).collect())
print("flatmap", rdd.flatMap(lambda x: x.split(" ")).collect())

# original data ['hello hadoop', 'hello pyspark']
# map [['hello', 'hadoop'], ['hello', 'pyspark']]
# flatMap ['hello', 'hadoop', 'hello', 'pyspark']
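
# Note: flatMap flattens only one level of nesting, as this extra example
# (not part of the original post) shows.
rdd_nested = sc.parallelize([[1, [2, 3]], [4, 5]])
print("one level flattened", rdd_nested.flatMap(lambda x: x).collect())

# one level flattened [1, [2, 3], 4, 5]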

# 3. filter: keep only the elements that satisfy a predicate
rdd = sc.parallelize(range(1, 11), 4)
print("original data", rdd.collect())
print("even numbers", rdd.filter(lambda x: x % 2 == 0).collect())

# original data [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# even numbers [2, 4, 6, 8, 10]

# 4. distinct: remove duplicate elements
rdd = sc.parallelize([1, 2, 3, 2, 5, 6, 8, 9, 8, 5, 1])
print("original data", rdd.collect())
print("deduplicated data", rdd.distinct().collect())

# original data [1, 2, 3, 2, 5, 6, 8, 9, 8, 5, 1]
# deduplicated data [8, 1, 5, 9, 2, 6, 3]  (order is not guaranteed)


# 5. reduceByKey: aggregate the values of each key with a reduce function

rdd = sc.parallelize([("a", 1), ("b", 1), ("c", 1), ("b", 1), ("c", 1)])
print("original data", rdd.collect())

# 5.1 with a lambda
print("summed data", rdd.reduceByKey(lambda a, b: a + b).collect())

# 5.2 with operator.add
from operator import add
print("summed data", rdd.reduceByKey(add).collect())

# original data [('a', 1), ('b', 1), ('c', 1), ('b', 1), ('c', 1)]
# summed data [('b', 2), ('c', 2), ('a', 1)]
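
# 5.3 A minimal word-count sketch (an extra illustration, not in the original
#     post) chaining flatMap, map and reduceByKey:
words = sc.parallelize(["hello hadoop", "hello pyspark"])
word_count = (words.flatMap(lambda line: line.split(" "))
              .map(lambda w: (w, 1))
              .reduceByKey(add))
print("word count", word_count.collect())

# word count [('hello', 2), ('hadoop', 1), ('pyspark', 1)]  (order may vary)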

# 6. mapPartitions: apply a function to each partition's iterator
rdd = sc.parallelize([1, 2, 3, 4, 5], 3)


def f(iterator):
    # receives an iterator over one partition and yields that partition's sum
    yield sum(iterator)


print("原始数据", rdd.collect())
print("映射数据", rdd.mapPartitions(f).collect())

# original data [1, 2, 3, 4, 5]
# per-partition sums [1, 5, 9]
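
# To see why the result is [1, 5, 9], glom() exposes the contents of each
# partition (an extra check, not in the original post).
print("partition contents", rdd.glom().collect())

# partition contents [[1], [2, 3], [4, 5]]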


# 7. sortBy: sort by a user-defined key function

rdd = sc.parallelize([('b', 1), ('a', 2), ('d', 3)])
print("original data", rdd.collect())
print("sorted by key", rdd.sortBy(lambda x: x[0]).collect())
print("sorted by value", rdd.sortBy(lambda x: x[1]).collect())

# original data [('b', 1), ('a', 2), ('d', 3)]
# sorted by key [('a', 2), ('b', 1), ('d', 3)]
# sorted by value [('b', 1), ('a', 2), ('d', 3)]
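
# sortBy also takes an ascending flag; a quick extra example (not in the
# original post) of a descending sort by value:
print("descending by value", rdd.sortBy(lambda x: x[1], ascending=False).collect())

# descending by value [('d', 3), ('a', 2), ('b', 1)]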

# 8. subtract: return the elements of x that do not appear in y (entire pairs must match)

x = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
y = sc.parallelize([('c', 3), ('b', None)])
print(sorted(x.subtract(y).collect()))

# [('a', 1), ('b', 2)]
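
# A related operator, subtractByKey, removes pairs by key alone, so ('b', 2)
# is dropped even though its value differs from ('b', None) in y
# (extra sketch, not in the original post).
print(sorted(x.subtractByKey(y).collect()))

# [('a', 1)]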

# 9. union: concatenate two RDDs (duplicates are kept)

rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([4, 5, 6])
print(rdd1.union(rdd2).collect())

# [1, 2, 3, 4, 5, 6]

# 10. intersection: elements present in both RDDs, with duplicates removed

rdd1 = sc.parallelize([1, 2, 3, 4, 5, 6])
rdd2 = sc.parallelize([2, 4, 6, 8, 1])
print(rdd1.intersection(rdd2).collect())

# [1, 2, 4, 6]

# 11. cartesian: Cartesian product of two RDDs

rdd = sc.parallelize([1, 3, 5])
print(rdd.cartesian(rdd).collect())

# [(1, 1), (1, 3), (1, 5), (3, 1), (3, 3), (3, 5), (5, 1), (5, 3), (5, 5)]

# 12. zip: pair elements by position; both RDDs must have the same number of partitions and the same number of elements in each partition

x = sc.parallelize(range(0, 5))
y = sc.parallelize(range(1000, 1005))
print(x.collect())
print(y.collect())
print(x.zip(y).collect())

# [0, 1, 2, 3, 4]
# [1000, 1001, 1002, 1003, 1004]
# [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]

# 13. zipWithIndex: zip the RDD with an increasing index starting at 0

rdd_name = sc.parallelize(["hive", "spark", "hbase", "hdfs"])
rdd_index = rdd_name.zipWithIndex()
print(rdd_index.collect())

# [('hive', 0), ('spark', 1), ('hbase', 2), ('hdfs', 3)]
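
# zipWithUniqueId is a related operator: the ids are unique but depend on the
# partitioning, so they are not guaranteed to be consecutive (extra sketch;
# the exact ids may differ on your setup).
print(rdd_name.zipWithUniqueId().collect())

# e.g. [('hive', 0), ('spark', 1), ('hbase', 2), ('hdfs', 3)] with 4 partitions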

# 14. groupByKey: group all values that share the same key

rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
print(rdd.collect())
print(rdd.groupByKey().mapValues(len).collect())
print(rdd.groupByKey().mapValues(list).collect())

# [('a', 1), ('b', 1), ('a', 1)]
# [('b', 1), ('a', 2)]
# [('b', [1]), ('a', [1, 1])]
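
# groupByKey().mapValues(sum) gives the same totals as reduceByKey, but
# reduceByKey is usually preferred because it combines values before the
# shuffle (extra comparison, not in the original post).
print(rdd.groupByKey().mapValues(sum).collect())

# [('b', 1), ('a', 2)]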

# 15. sortByKey(ascending, numPartitions): True sorts ascending, False sorts descending
rdd = sc.parallelize([("a", 1), ("b", 2), ("1", 3), ("c", 1)])

print(rdd.sortByKey(False, 2).collect())

# [('c', 1), ('b', 2), ('a', 1), ('1', 3)]
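
# For contrast, the same data sorted ascending (extra example, not in the
# original post):
print(rdd.sortByKey(True, 2).collect())

# [('1', 3), ('a', 1), ('b', 2), ('c', 1)]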

# 16. join: inner join on key; only keys present in both RDDs are kept

x = sc.parallelize([('a', 1), ('b', 3)])
y = sc.parallelize([('a', 2), ('c', 1), ('a', 3)])
print(x.join(y).collect())

# [('a', (1, 2)), ('a', (1, 3))]
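
# join is an inner join, so 'b' and 'c' are dropped; fullOuterJoin keeps them
# and fills the missing side with None (extra sketch, output order may vary).
print(x.fullOuterJoin(y).collect())

# [('a', (1, 2)), ('a', (1, 3)), ('b', (3, None)), ('c', (None, 1))]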

# 17. leftOuterJoin / rightOuterJoin

x = sc.parallelize([('a', 1), ('b', 2)])
y = sc.parallelize([('a', 2)])
print(x.leftOuterJoin(y).collect())

# [('a', (1, 2)), ('b', (2, None))]
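
# The heading also mentions rightOuterJoin; it keeps every key of the right
# RDD instead. Extra sketch with a hypothetical RDD z that has an unmatched
# key (output order may vary):
z = sc.parallelize([('a', 2), ('c', 3)])
print(x.rightOuterJoin(z).collect())

# [('a', (1, 2)), ('c', (None, 3))]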