from pyspark import SparkConf, SparkContext
conf = SparkConf().setAppName("test_SamShare").setMaster("local[4]")
sc = SparkContext(conf=conf)
# 1. map: apply a function to every element
rdd = sc.parallelize(range(1, 11), 4)
rdd_map = rdd.map(lambda x: x * 2)
print("rdd", rdd.collect())
print("rdd_map", rdd_map.collect())
# rdd [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# rdd_map [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
# 2. flatMap: map each element to a sequence, then flatten the result into one dimension
rdd = sc.parallelize(["hello hadoop", "hello pyspark"])
print("原始数据", rdd.collect())
print("map", rdd.map(lambda x: x.split(" ")).collect())
print("flatmap", rdd.flatMap(lambda x: x.split(" ")).collect())
# original ['hello hadoop', 'hello pyspark']
# map [['hello', 'hadoop'], ['hello', 'pyspark']]
# flatmap ['hello', 'hadoop', 'hello', 'pyspark']
# 3. filter: keep only the elements that satisfy a predicate
rdd = sc.parallelize(range(1, 11), 4)
print("original", rdd.collect())
print("even numbers", rdd.filter(lambda x: x % 2 == 0).collect())
# original [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
# even numbers [2, 4, 6, 8, 10]
# 4. distinct: remove duplicate elements
rdd = sc.parallelize([1, 2, 3, 2, 5, 6, 8, 9, 8, 5, 1])
print("original", rdd.collect())
print("deduplicated", rdd.distinct().collect())
# original [1, 2, 3, 2, 5, 6, 8, 9, 8, 5, 1]
# deduplicated [8, 1, 5, 9, 2, 6, 3]
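# Note: distinct shuffles the data by hash, so the output order differs from the
# input order; a quick sketch of getting a stable view with Python's sorted():
print("sorted", sorted(rdd.distinct().collect()))
# sorted [1, 2, 3, 5, 6, 8, 9]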
# 5. reduceByKey: merge the values for each key with the given function
rdd = sc.parallelize([("a", 1), ("b", 1), ("c", 1), ("b", 1), ("c", 1)])
print("original", rdd.collect())
# 5.1 with a lambda
print("summed", rdd.reduceByKey(lambda a, b: a + b).collect())
# 5.2 with operator.add
from operator import add
print("summed", rdd.reduceByKey(add).collect())
# original [('a', 1), ('b', 1), ('c', 1), ('b', 1), ('c', 1)]
# summed [('b', 2), ('c', 2), ('a', 1)]
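# A minimal word-count sketch that chains flatMap, map and reduceByKey from the
# sections above, reusing the sample sentences from section 2:
lines = sc.parallelize(["hello hadoop", "hello pyspark"])
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(add))
print("word counts", counts.collect())
# word counts e.g. [('hello', 2), ('hadoop', 1), ('pyspark', 1)] (order may vary)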
# 6. mapPartitions: apply a function to the iterator of each partition
rdd = sc.parallelize([1, 2, 3, 4, 5], 3)
def f(iterator):
    yield sum(iterator)
print("original", rdd.collect())
print("per-partition sums", rdd.mapPartitions(f).collect())
# original [1, 2, 3, 4, 5]
# per-partition sums [1, 5, 9]
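# Why [1, 5, 9]? With 3 partitions the elements land as [1], [2, 3] and [4, 5],
# which glom() makes visible (a sketch; the exact split depends on how
# parallelize slices the input):
print("partitions", rdd.glom().collect())
# partitions [[1], [2, 3], [4, 5]]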
# 7. sortBy: sort by the given key function
rdd = sc.parallelize([('b', 1), ('a', 2), ('d', 3)])
print("original", rdd.collect())
print("sorted by key", rdd.sortBy(lambda x: x[0]).collect())
print("sorted by value", rdd.sortBy(lambda x: x[1]).collect())
# original [('b', 1), ('a', 2), ('d', 3)]
# sorted by key [('a', 2), ('b', 1), ('d', 3)]
# sorted by value [('b', 1), ('a', 2), ('d', 3)]
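# sortBy also accepts an ascending flag; a quick sketch of sorting by value in
# descending order:
print("sorted by value desc", rdd.sortBy(lambda x: x[1], ascending=False).collect())
# sorted by value desc [('d', 3), ('a', 2), ('b', 1)]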
# 8. subtract: remove the elements of this RDD that also appear in the other RDD
x = sc.parallelize([('a', 1), ('b', 2), ('c', 3)])
y = sc.parallelize([('c', 3), ('b', None)])
print(sorted(x.subtract(y).collect()))
# [('a', 1), ('b', 2)]
# 9. union: concatenate two RDDs (duplicates are kept)
rdd1 = sc.parallelize([1, 2, 3])
rdd2 = sc.parallelize([4, 5, 6])
print(rdd1.union(rdd2).collect())
# [1, 2, 3, 4, 5, 6]
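# union does not deduplicate; chaining distinct() gives a set-style union. A small
# sketch with an extra overlapping RDD made up for illustration:
rdd3 = sc.parallelize([3, 4, 5])
print(rdd1.union(rdd3).distinct().collect())
# e.g. [4, 1, 5, 2, 3] (order depends on the hash partitioning)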
# 10. intersection: elements present in both RDDs, with duplicates removed
rdd1 = sc.parallelize([1, 2, 3, 4, 5, 6])
rdd2 = sc.parallelize([2, 4, 6, 8, 1])
print(rdd1.intersection(rdd2).collect())
# [1, 2, 4, 6]
# 11. cartesian: the Cartesian product of two RDDs
rdd = sc.parallelize([1, 3, 5])
print(rdd.cartesian(rdd).collect())
# [(1, 1), (1, 3), (1, 5), (3, 1), (3, 3), (3, 5), (5, 1), (5, 3), (5, 5)]
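# The result has rdd.count() * other.count() elements, so cartesian grows quickly
# on large RDDs; a small check of the size here:
print(rdd.cartesian(rdd).count())
# 9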
# 12. zip: pair elements positionally; both RDDs must have the same number of elements and the same number of partitions
x = sc.parallelize(range(0, 5))
y = sc.parallelize(range(1000, 1005))
print(x.collect())
print(y.collect())
print(x.zip(y).collect())
# [0, 1, 2, 3, 4]
# [1000, 1001, 1002, 1003, 1004]
# [(0, 1000), (1, 1001), (2, 1002), (3, 1003), (4, 1004)]
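# If the lengths or partition counts of the two RDDs differ, zip fails; a hedged
# sketch that catches the failure instead of crashing (z is made up for illustration):
z = sc.parallelize(range(1000, 1006), 3)
try:
    print(x.zip(z).collect())
except Exception as e:
    print("zip failed:", type(e).__name__)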
# 13. zipWithIndex: zip the RDD with an increasing index starting at 0
rdd_name = sc.parallelize(["hive", "spark", "hbase", "hdfs"])
rdd_index = rdd_name.zipWithIndex()
print(rdd_index.collect())
# [('hive', 0), ('spark', 1), ('hbase', 2), ('hdfs', 3)]
# 14. groupByKey: group the values for each key
rdd = sc.parallelize([("a", 1), ("b", 1), ("a", 1)])
print(rdd.collect())
print(rdd.groupByKey().mapValues(len).collect())
print(rdd.groupByKey().mapValues(list).collect())
# [('a', 1), ('b', 1), ('a', 1)]
# [('b', 1), ('a', 2)]
# [('b', [1]), ('a', [1, 1])]
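# groupByKey().mapValues(sum) produces the same totals as reduceByKey(add) from
# section 5, but reduceByKey combines values within each partition before the
# shuffle, so it is usually preferred for plain aggregations:
print(rdd.groupByKey().mapValues(sum).collect())
print(rdd.reduceByKey(add).collect())
# [('b', 1), ('a', 2)]
# [('b', 1), ('a', 2)]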
# 15. sortByKey(ascending, numPartitions): True for ascending order, False for descending
rdd = sc.parallelize([("a", 1), ("b", 2), ("1", 3), ("c", 1)])
print(rdd.sortByKey(False, 2).collect())
# [('c', 1), ('b', 2), ('a', 1), ('1', 3)]
# 16. join: inner join on matching keys
x = sc.parallelize([('a', 1), ('b', 3)])
y = sc.parallelize([('a', 2), ('c', 1), ('a', 3)])
print(x.join(y).collect())
# [('a', (1, 2)), ('a', (1, 3))]
# 17. leftOuterJoin / rightOuterJoin
x = sc.parallelize([('a', 1), ('b', 2)])
y = sc.parallelize([('a', 2)])
print(x.leftOuterJoin(y).collect())
# [('a', (1, 2)), ('b', (2, None))]
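# rightOuterJoin keeps every key of the right-hand RDD instead; a quick sketch with
# the same x and y:
print(x.rightOuterJoin(y).collect())
# [('a', (1, 2))]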
From: https://www.cnblogs.com/whiteY/p/17767844.html