pyspark的filter()、distinct()、sortBy() 函数

发布时间 2023-08-13 11:00:51作者: steve.z

#
#   py_pyspark_test.py
#   py_learn
#
#   Created by Z. Steve on 2023/8/12 17:38.
#


from pyspark import SparkConf, SparkContext

# Run Spark locally, using all available CPU cores.
conf = SparkConf().setMaster("local[*]").setAppName("rdd_test")

sc = SparkContext(conf=conf)

# Integer sample data for the filter()/distinct() examples below.
# rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 19, 20])

# (key, value) pairs used by the sortBy() example.
rdd = sc.parallelize((("abc", 1), ("def", 10), ("ghi", 11), ("jklc", 21), ("nmnl", 10), ("abxxxc", 101)))

# # 1. rdd.filter() — keep only elements matching the predicate.
# #    NOTE: this lambda expects the integer RDD above, not the tuple RDD.
# result = rdd.filter(lambda x: x % 2 == 0)
# print(result.collect())

# # 2. rdd.distinct() — remove duplicate elements.
# r1 = rdd.distinct()
# print(r1.collect())


# 3. rdd.sortBy() — sort by the second tuple field, descending.
#    numPartitions=1 forces a single output partition so collect()
#    returns one globally ordered list.
result = rdd.sortBy(lambda x: x[1], ascending=False, numPartitions=1)
print(result.collect())

# Shut down the SparkContext so the backing JVM gateway is released;
# without this the driver process can linger after the script exits.
sc.stop()