缘起
最近阅读<<我的第一本算法书>>(【日】石田保辉;宫崎修一)
本系列笔记拟采用golang练习之
k-means聚类算法
聚类就是在输入为多个数据时,将“相似”的数据分为一组的操作。k-means算法是聚类算法中的一种。首先随机选择k个点作为簇的中心点,然后重复执行“将数据分到相应的簇中”和“将中心点移到重心的位置”这两个操作,直到中心点不再发生变化为止。k-means算法中,随着操作的不断重复,中心点的位置必定会在某处收敛,这一点已经在数学层面上得到证明。摘自 <<我的第一本算法书>> 【日】石田保辉;宫崎修一
场景
- 某地突然爆发新冠疫情, 现防疫急需根据病例分布, 查找可能的病源地
- 首先将病例分布的坐标, 录入系统
- 然后根据k-means算法, 按k从1到3, 分别进行聚类
- 聚类的中心点, 可能就是病源地

流程
- 给定若干样本, 和样本距离计算器, 需要求解k个样本中心点
- 首先从样本中随机抽取k个点, 作为中心点
- 循环每个样本
- 分别计算样本点到k个中心点的距离
- 判断距离样本点最小的中心点
- 将样本划分到该最小中心点的簇
- 计算每个簇的中心点, 作为新的中心点
- 循环簇中的每个样本
- 计算该样本, 到本簇其他样本的距离之和
- 与其他样本的距离和最小的点, 就是新的中心点
- 重复3-4, 直到中心点不再变化, 计算完毕
设计
- IPoint: 样本点接口, 其实是一个空接口
- IDistanceCalculator: 距离计算器接口
- IClassifier: 分类器接口, 将samples聚类成k个, 并返回k个中心点
- tPerson: 病例样本点, 实现IPoint接口, 含x,y坐标
- tPersonDistanceCalculator: 病例距离计算器, 计算两点间x,y坐标的直线距离
- tKMeansClassifier: k-means聚类器, 实现IClassifier接口.
单元测试
k_means_test.go
package othersimport (km "learning/gooop/others/k_means""strings""testing")func Test_KMeans(t *testing.T) {// 创建样本点samples := []km.IPoint {km.NewPerson(2, 11),km.NewPerson(2, 8),km.NewPerson(2, 6),km.NewPerson(3, 12),km.NewPerson(3, 10),km.NewPerson(4, 7),km.NewPerson(4, 3),km.NewPerson(5, 11),km.NewPerson(5, 9),km.NewPerson(5, 2),km.NewPerson(7, 9),km.NewPerson(7, 6),km.NewPerson(7, 3),km.NewPerson(8, 12),km.NewPerson(9, 3),km.NewPerson(9, 5),km.NewPerson(9, 10),km.NewPerson(10, 3),km.NewPerson(10, 6),km.NewPerson(10, 12),km.NewPerson(11, 9),}fnPoints2String := func(points []km.IPoint) string {items := make([]string, len(points))for i,it := range points {items[i] = it.String()}return strings.Join(items, " ")}for k:=1;k<=3;k++ {centers := km.KMeansClassifier.Classify(samples, km.PersonDistanceCalculator, k)t.Log(fnPoints2String(centers))}}
测试输出
$ go test -v k_means_test.go=== RUN Test_KMeansk_means_test.go:53: p(7,6)k_means_test.go:53: p(5,9) p(7,3)k_means_test.go:53: p(9,10) p(3,10) p(7,3)--- PASS: Test_KMeans (0.00s)PASSok command-line-arguments 0.002s
IPoint.go
样本点接口, 其实是一个空接口
package kmimport "fmt"type IPoint interface {fmt.Stringer}
IDistanceCalculator.go
距离计算器接口
package kmtype IDistanceCalculator interface {Calc(a, b IPoint) int}
IClassifier.go
分类器接口, 将samples聚类成k个, 并返回k个中心点
package kmtype IClassifier interface {// 将samples聚类成k个, 并返回k个中心点Classify(samples []IPoint, calc IDistanceCalculator, k int) []IPoint}
tPerson.go
病例样本点, 实现IPoint接口, 含x,y坐标
package kmimport "fmt"type tPerson struct {x inty int}func NewPerson(x, y int) IPoint {return &tPerson{x, y, }}func (me *tPerson) String() string {return fmt.Sprintf("p(%v,%v)", me.x, me.y)}
tPersonDistanceCalculator.go
病例距离计算器, 计算两点间x,y坐标的直线距离
package kmtype tPersonDistanceCalculator struct {}var gMaxInt = 0x7fffffff_fffffffffunc newPersonDistanceCalculator() IDistanceCalculator {return &tPersonDistanceCalculator{}}func (me *tPersonDistanceCalculator) Calc(a, b IPoint) int {if a == b {return 0}p1, ok := a.(*tPerson)if !ok {return gMaxInt}p2, ok := b.(*tPerson)if !ok {return gMaxInt}dx := p1.x - p2.xdy := p1.y - p2.yd := dx*dx + dy*dyif d < 0 {panic(d)}return d}var PersonDistanceCalculator = newPersonDistanceCalculator()
tKMeansClassifier.go
k-means聚类器, 实现IClassifier接口.
package kmimport ("math/rand""time")type tKMeansClassifier struct {}type tPointEntry struct {point IPointdistance intindex int}func newPointEntry(p IPoint, d int, i int) *tPointEntry {return &tPointEntry{p, d, i,}}func newKMeansClassifier() IClassifier {return &tKMeansClassifier{}}// 将samples聚类成k个, 并返回k个中心点func (me *tKMeansClassifier) Classify(samples []IPoint, calc IDistanceCalculator, k int) []IPoint {sampleCount := len(samples)if sampleCount <= k {return samples}// 初始化, 随机选择k个中心点rnd := rand.New(rand.NewSource(time.Now().UnixNano()))centers := make([]IPoint, k)for selected, i:= make(map[int]bool, 0), 0;i < k; {n := rnd.Intn(sampleCount)_,ok := selected[n]if !ok {selected[n] = truecenters[i] = samples[n]i++}}// 根据到中心点的距离, 划分samplesfor {groups := me.split(samples, centers, calc)newCenters := make([]IPoint, k)for i,g := range groups {newCenters[i] = me.centerOf(g, calc)}if me.groupEquals(centers, newCenters) {return centers}centers = newCenters}}// 将样本点距离中心点的距离进行分簇func (me *tKMeansClassifier) split(samples []IPoint, centers []IPoint, calc IDistanceCalculator) [][]IPoint {k := len(centers)result := make([][]IPoint, k)for i := 0;i<k;i++ {result[i] = make([]IPoint, 0)}entries := make([]*tPointEntry, k)for i,c := range centers {entries[i] = newPointEntry(c, 0, i)}for _,p := range samples {for _,e := range entries {e.distance = calc.Calc(p, e.point)}center := me.min(entries)result[center.index] = append(result[center.index], p)}return result}// 计算一簇样本的重心. 重心就是距离各点的总和最小的点func (me *tKMeansClassifier) centerOf(samples []IPoint, calc IDistanceCalculator) IPoint {entries := make([]*tPointEntry, len(samples))for i,src := range samples {distance := 0for _,it := range samples {distance += calc.Calc(src, it)}entries[i] = newPointEntry(src, distance, i)}return me.min(entries).point}// 判断两组点是否相同func (me *tKMeansClassifier) groupEquals(g1, g2 []IPoint) bool {if len(g1) != len(g2) {return false}for i,v := range g1 {if g2[i] != v {return false}}return true}// 查找距离最小的点func (me *tKMeansClassifier) min(entries []*tPointEntry) *tPointEntry {minI := 0minD := gMaxIntfor i,it := range entries {if it.distance < minD {minI = iminD = it.distance}}return entries[minI]}var KMeansClassifier = newKMeansClassifier()
(end)
