Spark 目前支持 Hash 分区、Range 分区,以及用户自定义分区。Hash 分区为当前的默认分区。分区器直接决定了 RDD 中分区的个数、RDD 中每条数据经过 Shuffle 后进入哪个分区,进而决定了 Reduce 任务的个数。
    ➢ 只有 Key-Value 类型的 RDD 才有分区器,非 Key-Value 类型的 RDD 分区的值是 None
    ➢ 每个 RDD 的分区 ID 范围:0 ~ (numPartitions - 1),决定这个值是属于哪个分区的。
    1) Hash 分区:对于给定的 key,计算其 hashCode,并对分区个数取余(结果保证非负),得到该 key 所属的分区编号

    // Hash-based partitioner: routes each key to a partition by taking the
    // non-negative modulo of its hashCode against the partition count.
    class HashPartitioner(partitions: Int) extends Partitioner {
      require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

      // Number of partitions this partitioner produces.
      def numPartitions: Int = partitions

      // All null keys land in partition 0; any other key uses
      // hashCode mod numPartitions, kept non-negative even when
      // hashCode itself is negative.
      def getPartition(key: Any): Int = key match {
        case null => 0
        case _    => Utils.nonNegativeMod(key.hashCode, numPartitions)
      }

      // Two HashPartitioners are equal iff they produce the same
      // number of partitions (the hash function is fixed).
      override def equals(other: Any): Boolean = other match {
        case h: HashPartitioner => h.numPartitions == numPartitions
        case _                  => false
      }

      // Consistent with equals: equality depends only on numPartitions.
      override def hashCode: Int = numPartitions
    }

    2) Range 分区:将一定范围内的数据映射到一个分区中,尽量保证每个分区数据均匀,而且分区间有序

    // RangePartitioner: maps sortable keys into contiguous, roughly
    // equal-sized ranges; partition i holds keys <= rangeBounds(i).
    // NOTE(review): the bounds are presumably computed by sampling `rdd`,
    // but that body is elided ("...") in this excerpt — confirm against
    // the full Spark source.
    1. class RangePartitioner[K : Ordering : ClassTag, V](
    2. partitions: Int,
    3. rdd: RDD[_ <: Product2[K, V]],
    4. private var ascending: Boolean = true)
    5. extends Partitioner {
    6. // We allow partitions = 0, which happens when sorting an empty RDD under the
    7. default settings.
    8. require(partitions >= 0, s"Number of partitions cannot be negative but found
    9. $partitions.")
    10. private var ordering = implicitly[Ordering[K]]
    11. // An array of upper bounds for the first (partitions - 1) partitions
    12. private var rangeBounds: Array[K] = {
    13. ...
    14. }
    // One more partition than upper bounds: the last partition is unbounded above.
    15. def numPartitions: Int = rangeBounds.length + 1
    16. private var binarySearch: ((Array[K], K) => Int) =
    17. CollectionsUtils.makeBinarySearch[K]
    // Maps a key to its partition index: a linear scan over the bounds when
    // there are at most 128 of them, binary search otherwise.
    18. def getPartition(key: Any): Int = {
    19. val k = key.asInstanceOf[K]
    20. var partition = 0
    21. if (rangeBounds.length <= 128) {
    22. // If we have less than 128 partitions naive search
    23. while (partition < rangeBounds.length && ordering.gt(k,
    24. rangeBounds(partition))) {
    25. partition += 1
    26. }
    27. } else {
    28. // Determine which binary search method to use only once.
    29. partition = binarySearch(rangeBounds, k)
    30. // binarySearch either returns the match location or -[insertion point]-1
    31. if (partition < 0) {
    32. partition = -partition-1
    33. }
    34. if (partition > rangeBounds.length) {
    35. partition = rangeBounds.length
    36. }
    37. }
    // For descending order, mirror the index so the largest keys map to
    // the lowest partition numbers.
    38. if (ascending) {
    39. partition
    40. } else {
    41. rangeBounds.length - partition
    42. }
    43. }
    // equals/hashCode bodies elided in this excerpt.
    44. override def equals(other: Any): Boolean = other match {
    45. ...
    46. }
    47. override def hashCode(): Int = {
    48. ...
    49. }
    // Custom Java serialization hooks; bodies elided in this excerpt.
    50. @throws(classOf[IOException])
    51. private def writeObject(out: ObjectOutputStream): Unit =
    52. Utils.tryOrIOException {
    53. ...
    54. }
    55. @throws(classOf[IOException])
    56. private def readObject(in: ObjectInputStream): Unit = Utils.tryOrIOException
    57. {
    58. ...
    59. } }