交集、并集、差集
交集会产生shuffle操作
差集(subtract)同样会产生shuffle操作
/**
 * Intersection: prints the elements that appear in both RDDs.
 *
 * Note: intersection triggers a shuffle.
 */
@Test
def intersection(): Unit = {
  val left = sc.parallelize(List(1, 2, 3, 4, 5))
  val right = sc.parallelize(List(4, 5, 6, 7, 8, 9))
  // Materialize the common elements on the driver before printing.
  val common = left.intersection(right).collect().toList
  println(common)
}
/**
 * Difference: prints the elements of the first RDD that are not present in the second.
 */
@Test
def subtract(): Unit = {
  val left = sc.parallelize(List(1, 2, 3, 4, 5))
  val right = sc.parallelize(List(4, 5, 6, 7, 8, 9))
  // Materialize the remaining elements on the driver before printing.
  val onlyInLeft = left.subtract(right).collect().toList
  println(onlyInLeft)
}
/**
 * Union: concatenates two RDDs (duplicates are kept) and prints the
 * resulting partition count followed by the combined elements.
 *
 * Union does not shuffle; the result's partition count is the sum of the
 * partition counts of the two inputs.
 */
@Test
def union(): Unit = {
  val left = sc.parallelize(List(1, 2, 3, 4, 5))
  val right = sc.parallelize(List(4, 5, 6, 7, 8, 9))
  val combined = left.union(right)
  val partitionCount = combined.partitions.length
  println(partitionCount)
  val elements = combined.collect().toList
  println(elements)
}
zip拉链
/**
 * Zip: pairs up the elements of two RDDs by position and prints the pairs.
 *
 * Spark's zip requires both RDDs to have the same number of partitions and
 * the same number of elements in each partition, so both inputs are created
 * with the same explicit slice count.
 */
@Test
def zip(): Unit = {
  // Use one slice count for both RDDs; mixing an explicit count with the
  // default parallelism makes zip fail whenever the two values differ.
  val numSlices = 5
  val rdd1 = sc.parallelize(List("zhangsan", "lisi", "wangwu"), numSlices)
  val rdd2 = sc.parallelize(List(20, 30, 40), numSlices)
  val rdd3 = rdd1.zip(rdd2)
  println(rdd3.collect().toList)
}
join
一个join会产生两个shuffle,可用广播变量减少shuffle
union
union并集没有shuffle,RDD分区数=union的两个rdd分区数总和