#
# py_pyspark_test.py
# py_learn
#
# Created by Z. Steve on 2023/8/12 17:38.
#
from pyspark import SparkConf, SparkContext
conf = SparkConf().setMaster("local[*]").setAppName("rdd_test")  # run Spark locally on all CPU cores
sc = SparkContext(conf=conf)
# rdd = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 19, 20])
rdd = sc.parallelize([("abc", 1), ("def", 10), ("ghi", 11), ("jklc", 21), ("nmnl", 10), ("abxxxc", 101)])
# # 1. rdd.filter() -- keep only the elements that satisfy a predicate
# #    (this example targets the numeric RDD commented out above)
# result = rdd.filter(lambda x: x % 2 == 0)
# print(result.collect())
# # 2. rdd.distinct() -- remove duplicate elements
# r1 = rdd.distinct()
# print(r1.collect())
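# A runnable sketch of the two commented-out examples above, built on the
# numeric sample data from the commented-out parallelize() call (a separate
# RDD here, so the pair RDD used by sortBy() below stays untouched):
nums = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 19, 20])
print(nums.filter(lambda x: x % 2 == 0).collect())  # keep only even numbers
print(nums.distinct().collect())                    # drop duplicate elements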
# 3. rdd.sortBy() -- sort by the second tuple element (the value) in
#    descending order; numPartitions=1 collects the result into a single
#    partition so the overall ordering is preserved
result = rdd.sortBy(lambda x: x[1], ascending=False, numPartitions=1)
print(result.collect())
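# Shut down the SparkContext so the local Spark job releases its resources.
sc.stop()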
# From: https://www.cnblogs.com/zxhoo/p/17626276.html