原始数据—>数据特征工程(训练数据和测试数据)—>建立模型—>模型评估(测试数据进行评估)—>判断模型是否合格(不合格继续进行训练,算法学习)—>模型应用
// Train a logistic-regression classifier on LIBSVM-formatted body-metric data,
// evaluate accuracy on a held-out test set, and persist the model.
// Pipeline: raw data -> feature engineering -> fit model -> evaluate -> save -> reuse.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("person")
  .master("local")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Read the preprocessed data in LIBSVM format (label + sparse "features" vector).
val data: DataFrame = spark.read.format("libsvm").load("data/人体指标.txt")
// data.show(false) // inspect; false = do not truncate column contents

// 3. Random 70% / 30% split into training and test sets.
val splitDS: Array[Dataset[Row]] = data.randomSplit(Array(0.7, 0.3))
val tran: Dataset[Row] = splitDS(0) // training set
val test: Dataset[Row] = splitDS(1) // test set

// 4. Binary classification problem -> logistic regression.
val regression: LogisticRegression = new LogisticRegression()
  .setFitIntercept(true) // fit an intercept term
  .setMaxIter(100)       // cap optimizer iterations

// 5. Fit the model on the training set.
val model: LogisticRegressionModel = regression.fit(tran)

// 6. Score the test set; transform appends a "prediction" column.
val testDF: DataFrame = model.transform(test)
// testDF.show(10000, false)

// 7. Accuracy = (#rows where label == prediction) / (#rows).
//    Fixed: original summed the *strings* "1.0"/"0.0"; use numeric literals.
val p: DataFrame = testDF.select(
  sum(when($"label" === $"prediction", 1.0).otherwise(0.0)) / count($"label") as "p"
)
// p.show() // inspect accuracy

// 8. Persist the model once accuracy is acceptable.
model.save("data/personModel")
// 9. Reuse later with:
// val model: LogisticRegressionModel = LogisticRegressionModel.load("data/personModel")
// Load the previously trained person model and predict a single sample.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("person")
  .master("local")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Load the persisted model.
val model: LogisticRegressionModel = LogisticRegressionModel.load("data/personModel")

/**
 * One sample in LIBSVM notation:
 * 0 1:5.3 2:3.5 3:2.5 4:106.4 5:67.5 6:69.1 7:83
 * Below we rebuild just the feature vector (the leading 0 is the label).
 */
val vector: linalg.Vector = Vectors.dense(Array(5.3, 3.5, 2.5, 106.4, 67.5, 69.1, 83))

// 3. Predict the class for this feature vector and print it.
val result: Double = model.predict(vector)
println(result)
// Cluster 2-D points with K-means (k = 3) and show the cluster assignments.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("Kmeans")
  .master("local")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._

// 2. Read the points as CSV with two double columns.
//    NOTE(review): the load path is empty in the original — fill in the data location.
val data: DataFrame = spark.read.format("csv")
  .schema("x DOUBLE,y DOUBLE")
  .load("")

// 3. Pack each (x, y) row into a feature array.
val ds: Dataset[(Double, Double)] = data.as[(Double, Double)]
// The column MUST be named "features", otherwise KMeans cannot find its input.
val vectorDF: DataFrame = ds.map(kv => Array(kv._1, kv._2)).toDF("features")

// 4. Configure K-means with 3 cluster centers.
val kmeans: KMeans = new KMeans().setK(3)

// 5. Iteratively fit the model.
val model: KMeansModel = kmeans.fit(vectorDF)

// 6. Assign each point to a cluster (adds a "prediction" column) and display.
val result: DataFrame = model.transform(vectorDF)
result.show(100000)
// Convert raw training images into binarized feature vectors, join each image
// with its label by file name, and persist the result in LIBSVM format.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("image")
  .master("local[*]")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Read the images with Spark's "image" data source, then repartition for parallelism.
val imageData: DataFrame = spark.read.format("image")
  .load("data/train")
  .repartition(64)
/*
 * Schema of the image source:
 * root
 *  |-- image: struct (nullable = true)
 *  |    |-- origin: string      file path
 *  |    |-- height: integer     image height
 *  |    |-- width: integer      image width
 *  |    |-- nChannels: integer
 *  |    |-- mode: integer
 *  |    |-- data: binary        raw pixel bytes
 */
// imageData.printSchema()

// 3. Keep only the file path and the raw pixel bytes.
val data: DataFrame = imageData.select($"image.origin" as "name", $"image.data" as "data")

// 4. Feature extraction: binarize every pixel byte.
//    DataFrame has no typed map; convert to a Dataset first.
val nameandfeatures: DataFrame = data.as[(String, Array[Byte])]
  .map { kv =>
    val name: String = kv._1.split("/").last // file name only (drop directories)
    val value: Array[Byte] = kv._2           // raw pixel bytes (signed)
    val newdata: Array[Double] = value
      .map(_.toDouble)
      .map(p => if (p < 0) 1.0 else 0.0) // bytes < 0 -> 1.0, others -> 0.0
    // A sparse vector would save space, but produces vectors of differing
    // lengths downstream, so a dense vector is used deliberately.
    val features: linalg.Vector = Vectors.dense(newdata)
    (name, features)
  }
  .toDF("name", "features")

// 5. Read the label file: space-separated "name label" rows.
//    The label column MUST be named "label" for the downstream estimators.
//    NOTE(review): the load path is empty in the original — fill it in.
val labelData: DataFrame = spark.read.format("csv")
  .option("sep", " ")
  .schema("name String,label Double")
  .load("")

// 6. Join features with labels on the file name; broadcast the small label table.
//    Using List("name") avoids the ambiguous-column error of $"name" === $"name".
val resultData: DataFrame = nameandfeatures
  .join(labelData.hint("broadcast"), List("name"), "inner")
  .select("label", "features")

// 7. Persist as LIBSVM for the training job.
resultData.write.mode(SaveMode.Overwrite).format("libsvm").save("data/images")
// Train a logistic-regression image classifier from the preprocessed LIBSVM data,
// report test-set accuracy, and persist the model.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("image")
  .master("local[*]")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Read the preprocessed features produced by the image-preprocessing job.
val data: DataFrame = spark.read.format("libsvm").load("data/images")

// 3. Random 70% / 30% train/test split.
val array: Array[Dataset[Row]] = data.randomSplit(Array(0.7, 0.3))
val train: Dataset[Row] = array(0) // training set
val test: Dataset[Row] = array(1)  // test set

// 4. Configure logistic regression.
val regression: LogisticRegression = new LogisticRegression()
  .setFitIntercept(true) // fit an intercept term
  .setMaxIter(100)       // cap optimizer iterations

// 5. Fit on the training set and score the test set.
val model: LogisticRegressionModel = regression.fit(train)
val result: DataFrame = model.transform(test)

// 6. Accuracy = matches / total rows.
result.select(
  sum(when($"label" === $"prediction", 1).otherwise(0)) / count($"label") as "rate"
).show()

// 7. Persist the model, overwriting any previous save.
model.write.overwrite().save("data/imageModel")
// Load the trained image model and classify a single image file.

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("image")
  .master("local[*]")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Load the persisted model.
val model: LogisticRegressionModel = LogisticRegressionModel.load("data/imageModel")

// 3. Read the image to classify and keep its path and raw pixel bytes.
val imageData: DataFrame = spark.read.format("image").load("data/27550.jpg")
val data: DataFrame = imageData.select($"image.origin" as "name", $"image.data" as "data")

// 4. Feature extraction — MUST mirror the preprocessing used at training time:
//    bytes < 0 -> 1.0, others -> 0.0, packed into a dense vector.
val nameandfeatures: DataFrame = data.as[(String, Array[Byte])]
  .map { kv =>
    val name: String = kv._1       // file path
    val value: Array[Byte] = kv._2 // raw pixel bytes (signed)
    val newdata: Array[Double] = value
      .map(_.toDouble)
      .map(p => if (p < 0) 1.0 else 0.0)
    val features: linalg.Vector = Vectors.dense(newdata)
    (name, features)
  }
  .toDF("name", "features")

// 5. Score the image (adds a "prediction" column) and display.
model.transform(nameandfeatures).show()
/**
 * Thin wrapper around the IK Analyzer for Chinese word segmentation.
 */
object Demo08IK {

  def main(args: Array[String]): Unit = {
    val str = "别人笑我太疯癫,我笑他人看不穿;不见五陵豪杰墓,无花无酒锄作田。"
    val words: List[String] = fit(str)
    println(words)
  }

  /**
   * Segments `text` into a list of words using IK in smart mode.
   *
   * @param text the raw Chinese text to segment
   * @return the segmented tokens, in original order
   */
  def fit(text: String): List[String] = {
    val listBuffer: ListBuffer[String] = new ListBuffer[String]
    val sr: StringReader = new StringReader(text)
    // true = smart (coarse-grained) segmentation; false would be fine-grained
    val ik: IKSegmenter = new IKSegmenter(sr, true)
    var word: Lexeme = ik.next()
    while (word != null) {
      listBuffer += word.getLexemeText // fixed: original appended the bare identifier `LexemeText`
      word = ik.next()
    }
    listBuffer.toList // fixed: original returned the `List` companion object, not the tokens
  }
}
// Chinese text classification: IK segmentation -> TF-IDF features -> Naive Bayes.
// Persists both the IDF model and the classifier (both are required at inference time).

// 1. Build the Spark session.
val spark: SparkSession = SparkSession.builder()
  .appName("person")
  .master("local[*]")
  .config("spark.sql.shuffle.partitions", "2") // fixed: original key was "spark,sql.shuffle.partitions"
  .getOrCreate()
import spark.implicits._
import org.apache.spark.sql.functions._

// 2. Read labelled text: label<TAB>text per row.
//    NOTE(review): the original had sep "t"; a tab "\t" is almost certainly
//    intended (the backslash was lost in transcription) — confirm against the data.
//    The load path is also empty in the original — fill it in.
val data: DataFrame = spark.read.format("csv")
  .option("sep", "\t")
  .schema("label DOUBLE,text STRING")
  .load("")

// 3. Segment each document into words with the IK analyzer.
val wordsDS: Dataset[(Double, List[String])] = data.as[(Double, String)].map { kv =>
  val label: Double = kv._1
  val text: String = kv._2
  (label, Demo08IK.fit(text))
}
// wordsDS.show(1000, false)

// 4. Drop documents with too few tokens (likely noise).
val filterDS: Dataset[(Double, List[String])] = wordsDS.filter(_._2.length > 2)

// 5. Re-join tokens with spaces so the built-in whitespace Tokenizer can be reused.
val linesDF: DataFrame = filterDS
  .map(kv => (kv._1, kv._2.mkString(" ")))
  .toDF("label", "text")

// 6. Whitespace tokenization back into an array-of-words column.
val tokenizer: Tokenizer = new Tokenizer().setInputCol("text").setOutputCol("words")
val wordsData: DataFrame = tokenizer.transform(linesDF)

// 7. TF (hashed term frequencies) followed by IDF weighting.
val hashingTF: HashingTF = new HashingTF().setInputCol("words").setOutputCol("rawFeatures")
val featurizedData: DataFrame = hashingTF.transform(wordsData)
val idf: IDF = new IDF().setInputCol("rawFeatures").setOutputCol("features")
val idfModel: IDFModel = idf.fit(featurizedData)            // fit the IDF statistics
val rescaledData: DataFrame = idfModel.transform(featurizedData) // label + features

// 8. Random 70% / 30% train/test split.
val array: Array[Dataset[Row]] = rescaledData.randomSplit(Array(0.7, 0.3))
val train: Dataset[Row] = array(0)
val test: Dataset[Row] = array(1)

// 9. Naive Bayes classifier (reads the "label" and "features" columns).
val model: NaiveBayesModel = new NaiveBayes().fit(train)
val dataFrame: DataFrame = model.transform(test)

// 10. Accuracy = matches / total rows.
dataFrame.select(
  sum(when($"label" === $"prediction", 1).otherwise(0)) / count($"label") as "rate"
).show()

// 11. Persist the IDF model and the Naive Bayes model.
idfModel.write.overwrite().save("data/idfModel")
model.write.overwrite().save("data/naiveBayes")
附上所需的文件
本文发布于:2024-02-01 01:56:15,感谢您对本站的认可!
本文链接:https://www.4u4v.net/it/170672377833015.html
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系,我们将在24小时内删除。
留言与评论(共有 0 条评论) |