def _calculate_centroids(cluster: ListBuffer[(Int, DenseVector[Double])]): List[DenseVector[Double]] = {
  cluster.groupBy(_._1).map { x =>
    // average all vectors assigned to the same cluster id
    val temp = x._2.map(_._2)
    temp.reduce((a, b) => a :+ b).map(_ / temp.length)
  }.toList
}
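// A minimal usage sketch for _calculate_centroids, assuming Breeze's
// DenseVector and a toy buffer of (cluster id, point) pairs:
//
//   import breeze.linalg.DenseVector
//   import scala.collection.mutable.ListBuffer
//
//   val assignments = ListBuffer(
//     (0, DenseVector(1.0, 1.0)), (0, DenseVector(3.0, 3.0)), // cluster 0 -> centroid (2, 2)
//     (1, DenseVector(5.0, 7.0))                              // cluster 1 -> centroid (5, 7)
//   )
//   val centroids = _calculate_centroids(assignments)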
/**
  * @param x input matrix
  * @return predicted vector
  */
override def predict(x: DenseMatrix[Double]): DenseVector[Double] = {
  DenseVector[Double]()
}
}
object Kmeans {
  def main(args: Array[String]): Unit = {
    val irisData = Data.irisData
    val data = irisData.map(_.slice(0, 4)).map(DenseVector(_)).toList
    val kmeans = new Kmeans(max_iter = 100)
    kmeans.train(data)
    println(kmeans.centroids)
  }
}
var means: Array[DenseVector[Double]] = new Array[DenseVector[Double]](k)
var vars: Array[DenseMatrix[Double]] = new Array[DenseMatrix[Double]](k)
var sample_assignments: DenseVector[Int] = _
var priors: DenseVector[Double] = _
var responsibility: DenseMatrix[Double] = _
var responsibilities: ArrayBuffer[DenseVector[Double]] = new ArrayBuffer[DenseVector[Double]]()
def _initialize(X: DenseMatrix[Double]): Unit = {
  val n_samples = X.rows
  val X_lst = (0 until n_samples).map(X.t(::, _))
  val rng = new scala.util.Random()
  // uniform mixing weights to start with
  priors = DenseVector.ones[Double](k) :/ k.toDouble
  // pick k random samples as the initial component means
  means = rng.shuffle(X_lst).take(k).toArray
  // means = X_lst.take(k).toArray
  // every component starts from the covariance of the full data set
  for (i <- 0 until k) {
    vars(i) = MatrixUtils.calculate_covariance_matrix(X)
  }
}
def multivariate_gaussian(X: DenseMatrix[Double], i: Int): DenseVector[Double] = {
  val n_features = X.cols
  val mean = means(i)
  val covar = vars(i)
  val determinant = det(covar)
  val likelihoods = DenseVector.zeros[Double](X.rows)
  val X_arr = (0 until X.rows).map(X.t(::, _))
  for ((sample, index) <- X_arr.zipWithIndex) {
    val d = n_features
    // d / 2.0, not d / 2: integer division would break the exponent for odd d
    val coeff = 1.0 / (math.pow(2 * Math.PI, d / 2.0) * math.sqrt(determinant))
    val gram = (sample :- mean).t * pinv(covar) * (sample :- mean)
    val exponent = math.exp(-0.5 * gram)
    likelihoods(index) = coeff * exponent
  }
  likelihoods
}
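// The loop above evaluates the multivariate normal density
//   N(x | mu_i, Sigma_i) = (2 * pi)^(-d / 2) * |Sigma_i|^(-1 / 2)
//                          * exp(-1/2 * (x - mu_i)^T * Sigma_i^(-1) * (x - mu_i))
// for every row x of X. pinv is used instead of a plain inverse so that a
// near-singular covariance matrix does not derail the computation.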
def _get_likelihoods(X: DenseMatrix[Double]): DenseMatrix[Double] = {
  val n_samples = X.rows
  val likelihoods = DenseMatrix.zeros[Double](n_samples, k)
  for (i <- 0 until k) {
    likelihoods(::, i) := multivariate_gaussian(X, i)
  }
  likelihoods
}
def _expectation(X: DenseMatrix[Double]): Unit = {
  val weighted_likelihoods = _get_likelihoods(X)(*, ::).map(x => x :* priors)
  val sum_likelihoods = sum(weighted_likelihoods, Axis._1)
  // column-wise division: each column is divided by the per-row totals
  responsibility = weighted_likelihoods(::, *).map(x => x :/ sum_likelihoods)
  sample_assignments = argmax(responsibility, Axis._1)
  responsibilities.append(max(responsibility, Axis._1))
}
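// E-step in formula form: the responsibility of component i for sample n is
//   gamma(n, i) = priors(i) * N(x_n | mu_i, Sigma_i)
//                 / sum_j priors(j) * N(x_n | mu_j, Sigma_j)
// so each row of `responsibility` sums to one. The per-sample maximum is
// stored so that convergence can be checked between iterations.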
def _maximization(X: DenseMatrix[Double]): Unit = {
  for (i <- 0 until k) {
    val resp = responsibility(::, i)
    val mean = sum(X(::, *).map(f => resp :* f), Axis._0) :/ sum(resp)
    means(i) = mean.t
    val diff = X(*, ::).map(f => f :- mean.t)
    // note: diff(::, *) operates column-wise, weighting each column by resp
    val covariance = diff.t * diff(::, *).map(f => f :* resp) :/ sum(resp)
    vars(i) = covariance
  }
  val n_samples = X.rows
  priors = sum(responsibility, Axis._0).t :/ n_samples.toDouble
}
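// M-step in formula form, with N_i = sum_n gamma(n, i):
//   mu_i    = (1 / N_i) * sum_n gamma(n, i) * x_n
//   Sigma_i = (1 / N_i) * sum_n gamma(n, i) * (x_n - mu_i)(x_n - mu_i)^T
//   pi_i    = N_i / n_samples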
def predict(X: DenseMatrix[Double]): DenseVector[Double] = {
  _initialize(X)
  var iter = 0
  var flag = true
  for (_ <- 0 until max_iterations if flag) {
    iter += 1
    _expectation(X)
    _maximization(X)
    // convergence test: stop once the change in per-sample maximum
    // responsibilities falls below the tolerance
    if (responsibilities.length >= 2) {
      val n = responsibilities.length
      val diff = norm(responsibilities(n - 1) - responsibilities(n - 2), 2)
      println(diff)
      if (diff <= tolerance) flag = false
    }
  }
  logger.info(s"converged after $iter iterations")
  _expectation(X)
  sample_assignments.map(_.toDouble)
}
}
object GMM {
  def main(args: Array[String]): Unit = {
    val irisData = Data.irisData
    val data = irisData.map(_.slice(0, 4)).toList
    val target = irisData.map(_.apply(4))
    val gmm = new GMM(max_iterations = 100)
    gmm._initialize(DenseMatrix(data: _*))
    val pred = gmm.predict(DenseMatrix(data: _*))
    println(pred)
  }
}
/**
  * Partitioning (clustering) of the data into k clusters "around medoids",
  * a more robust version of k-means.
  *
  * @param k number of clusters
  */
class PAM(k: Int = 2, seed: Long = 1234L) {
def _init_random_medoids(X: DenseMatrix[Double]): IndexedSeq[DenseVector[Double]] = {
  val n_samples = X.rows
  val data = (0 until n_samples).map(X.t(::, _))
  val rng = new Random(seed)
  rng.shuffle(data).take(k)
}
def _closest_medoid(sample: DenseVector[Double], medoids: IndexedSeq[DenseVector[Double]]): Int = {
  val distWithIndex = medoids.zipWithIndex
    .map(x => (squaredDistance(x._1, sample), x._2))
    .minBy(_._1)
  distWithIndex._2
}
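// A minimal sketch of the medoid lookup, assuming Breeze's squaredDistance:
//
//   val medoids = IndexedSeq(DenseVector(0.0, 0.0), DenseVector(10.0, 10.0))
//   _closest_medoid(DenseVector(1.0, 1.0), medoids)  // -> 0 (nearer the origin)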
def _create_clusters(X: DenseMatrix[Double], medoids: IndexedSeq[DenseVector[Double]]): Array[Array[Int]] = {
  val clusters = new Array[Int](X.rows)
  val data = (0 until X.rows).map(X.t(::, _))
  for ((sample, inx) <- data.zipWithIndex) {
    val medoid_i = _closest_medoid(sample, medoids)
    clusters(inx) = medoid_i
  }
  // group sample indices by their medoid, ordered by medoid index
  clusters.zipWithIndex.groupBy(_._1).toArray.sortBy(_._1).map(_._2.map(_._2))
}
def _calculate_cost(X: DenseMatrix[Double], clusters: Array[Array[Int]], medoids: IndexedSeq[DenseVector[Double]]): Double = {
  var cost = 0.0
  val data = (0 until X.rows).map(X.t(::, _))
  for ((cluster, i) <- clusters.zipWithIndex) {
    val medoid = medoids(i)
    for (sample_i <- cluster) {
      cost += squaredDistance(data(sample_i), medoid)
    }
  }
  cost
}
def _get_cluster_labels(clusters: Array[Array[Int]], X: DenseMatrix[Double]): Array[Int] = {
  val y_pred = Array.fill(X.rows)(0)
  for (cluster_i <- clusters.indices) {
    val cluster = clusters(cluster_i)
    for (sample_i <- cluster) {
      y_pred(sample_i) = cluster_i
    }
  }
  y_pred
}

def _get_no_medoids(X: DenseMatrix[Double], medoids: IndexedSeq[DenseVector[Double]]): Array[DenseVector[Double]] = {
  val non_medoids: ArrayBuffer[DenseVector[Double]] = new ArrayBuffer[DenseVector[Double]]()
  val data = (0 until X.rows).map(X.t(::, _))
  // collect every sample that is not currently one of the medoids
  for (sample <- data if !medoids.contains(sample)) {
    non_medoids.append(sample)
  }
  non_medoids.toArray
}
object PAM {
  def main(args: Array[String]): Unit = {
    val irisData = Data.irisData
    val data = irisData.map(_.slice(0, 4)).toList
    val target = irisData.map(_.apply(4))
    val pam = new PAM(k = 3)
    val pred = pam.predict(DenseMatrix(data: _*))
    println(pred.toList)
    val acc = Metric.accuracy(pred.map(_.toDouble), target) * 100
    println(f"accuracy: $acc%-5.2f%%")
  }
}
def fit(X: DenseMatrix[Double], y: DenseVector[Int]) = {
  // initialise each prototype from a random sample of its own class
  p = DenseMatrix.zeros[Double](q, X.cols)
  for (i <- 0 until q) {
    C += (i -> ArrayBuffer[Int]())
    val candidate_indices = y.toArray.indices.filter(f => y(f) == t(i))
    val target_indice = Random.shuffle(candidate_indices.toList).take(1).apply(0)
    p(i, ::) := X(target_indice, ::)
  }
  var p_arr = (0 until p.rows).map(p.t(::, _))
  for (_ <- 0 until nums_iters) {
    // pick a random training sample and find its nearest prototype
    val j = Random.shuffle(Range(0, y.length).toList).take(1).apply(0)
    val x_j = X(j, ::).t
    val d = p_arr.map(f => euclidean_distance(f, x_j))
    val idx: Int = argmin(DenseVector(d.toArray))
    if (y(j) == t(idx)) {
      // same label: pull the winning prototype towards the sample
      p(idx, ::) := p(idx, ::) :+ ((X(j, ::) :- p(idx, ::)) :* lr) // :+ and :* share the same operator precedence
    } else {
      // different label: push the winning prototype away from the sample
      p(idx, ::) := p(idx, ::) :- ((X(j, ::) :- p(idx, ::)) :* lr)
    }
  }
  // assign every sample to its nearest learned prototype
  p_arr = (0 until p.rows).map(p.t(::, _))
  for (j <- 0 until X.rows) {
    val d = p_arr.map(f => euclidean_distance(f, X(j, ::).t))
    val idx: Int = argmin(DenseVector(d.toArray))
    C(idx).append(j)
  }
  labels = DenseVector.zeros[Int](X.rows)
  for (i <- 0 until q) {
    for (j <- C(i)) {
      labels(j) = i
    }
  }
}
def predict(X: DenseMatrix[Double]): DenseVector[Int] = {
  val p_arr = (0 until p.rows).map(p.t(::, _))
  val preds_y: ArrayBuffer[Int] = new ArrayBuffer[Int]()
  for (j <- 0 until X.rows) {
    val d = p_arr.map(f => euclidean_distance(f, X(j, ::).t))
    val idx: Int = argmin(DenseVector(d.toArray))
    preds_y.append(t(idx))
  }
  DenseVector(preds_y.toArray)
}
}
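// A hypothetical driver mirroring the other objects in this file. The class
// name LVQ and its no-argument constructor are assumptions, since the class
// declaration for fit/predict above is not shown here:
//
//   object LVQ {
//     def main(args: Array[String]): Unit = {
//       val irisData = Data.irisData
//       val X = DenseMatrix(irisData.map(_.slice(0, 4)): _*)
//       val y = DenseVector(irisData.map(_.apply(4).toInt))
//       val lvq = new LVQ()
//       lvq.fit(X, y)
//       println(lvq.predict(X))
//     }
//   }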
object HierarchicalCluster {
  def main(args: Array[String]): Unit = {
    val irisData = Data.irisData
    val data = irisData.map(_.slice(0, 4))
    val dd = DenseMatrix(data: _*)
    val hc = new HierarchicalCluster(k = 3)
    hc.fit(dd)
    println(hc.labels)
  }
}
def shiftPoint(point: DenseVector[Double], points: List[DenseVector[Double]], kernel_bandwidth: Double): DenseVector[Double] = {
  val distance = points.map(t => point - t)
  val point_weights = gaussian_kernel(DenseMatrix(distance: _*), kernel_bandwidth)
  val tiled_weight = tile(point_weights, 1, point.length)
  val denominator = sum(point_weights)
  // element-wise product of weights and points, then a weighted average
  sum(tiled_weight :* DenseMatrix(points: _*), Axis._0).t / denominator
}
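// shiftPoint computes the mean-shift update
//   m(x) = sum_i K(x - x_i) * x_i / sum_i K(x - x_i)
// where K is the Gaussian kernel with the given bandwidth. Each point is
// repeatedly moved to the weighted mean of its neighbourhood until it
// settles on a mode of the estimated density.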
def group_points(points: List[DenseVector[Double]]): List[Int] = {
  val cluster_ids: ArrayBuffer[Int] = new ArrayBuffer[Int]()
  var cluster_idx = 0
  val cluster_centers: ArrayBuffer[DenseVector[Double]] = new ArrayBuffer[DenseVector[Double]]()
  // assign each converged point to an existing centre within a small
  // tolerance, or open a new cluster for it; the 0.1 threshold is an
  // assumption, since the original grouping body is not shown here
  for (point <- points) {
    val nearest = cluster_centers.indexWhere(c => euclidean_distance(c, point) < 0.1)
    if (nearest >= 0) {
      cluster_ids.append(nearest)
    } else {
      cluster_centers.append(point)
      cluster_ids.append(cluster_idx)
      cluster_idx += 1
    }
  }
  cluster_ids.toList
}
def fit(points: List[DenseVector[Double]]): List[Int] = {
  // work on copies: the original assignment shared references with `points`,
  // so the in-place := below would also have mutated the density samples
  val shift_points = points.map(_.copy)
  var max_min_dist = 1.0
  var iteration_number = 0
  val still_shifting: Array[Boolean] = Array.fill(points.length)(true)
  val MIN_DISTANCE = 1e-5
  while (max_min_dist > MIN_DISTANCE) {
    max_min_dist = 0.0
    iteration_number += 1
    // the original breakable { if (...) break } wrapped only the check, so it
    // never actually skipped the body; a guarded loop does what was intended
    for (i <- shift_points.indices if still_shifting(i)) {
      var p_new = shift_points(i).copy
      val p_new_start = p_new.copy
      p_new = shiftPoint(p_new, points, kernel_bandwidth)
      val dist = euclidean_distance(p_new, p_new_start)
      if (dist > max_min_dist) {
        max_min_dist = dist
      }
      if (dist < MIN_DISTANCE) {
        still_shifting(i) = false
      }
      shift_points(i) := p_new
    }
  }
  group_points(shift_points)
}
}
object MeanShift {
  def main(args: Array[String]): Unit = {
    val irisData = Data.irisData
    val data = irisData.map(_.slice(0, 4)).map(DenseVector(_)).toList
    val ms = new MeanShift(kernel_bandwidth = 0.3)
    val label = ms.fit(data)
    println(label)
  }
}