Install Spark
Install Java 8+ (see the Spark docs for supported versions): https://spark.apache.org/docs/latest/index.html
Download and install Spark: https://spark.apache.org/downloads.html
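A typical setup, assuming the pre-built spark-3.5.2-bin-hadoop3 package was downloaded from the page above (the target directory here is an assumption chosen to match the SPARK_HOME used later):
tar -xzf spark-3.5.2-bin-hadoop3.tgz -C ~/app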
Verify the installation from the Spark bin directory:
./spark-shell --version
Code
Spark dependency: https://spark.apache.org/docs/latest/quick-start.html#self-contained-applications
<dependency> <!-- Spark dependency -->
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.12</artifactId>
    <version>3.5.2</version>
    <scope>provided</scope>
</dependency>
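The provided scope keeps Spark's classes out of the application jar, since spark-submit supplies them on the classpath at runtime. For context, a minimal pom.xml around this dependency might look like the following sketch (the groupId is hypothetical; the artifactId and version are assumptions chosen to match target/spark-demo-1.0-SNAPSHOT.jar used in the spark-submit command below):

<project xmlns="http://maven.apache.org/POM/4.0.0">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.example</groupId>       <!-- hypothetical group id -->
    <artifactId>spark-demo</artifactId>  <!-- matches target/spark-demo-1.0-SNAPSHOT.jar -->
    <version>1.0-SNAPSHOT</version>
    <properties>
        <!-- Spark 3.5 runs on Java 8+ -->
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
    </properties>
    <dependencies>
        <dependency> <!-- Spark dependency -->
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.12</artifactId>
            <version>3.5.2</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>
</project>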
Word count code
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.sql.SparkSession;
import scala.Tuple2;

import java.util.Arrays;
import java.util.List;

/**
 * Run with:
 * SPARK_HOME=/Users/liaozibo/app/spark-3.5.2-bin-hadoop3
 * $SPARK_HOME/bin/spark-submit --class "WordCount" --master "local[*]" target/spark-demo-1.0-SNAPSHOT.jar
 */
public class WordCount {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
                .getOrCreate();
        JavaRDD<String> lineRdd = spark.read().textFile("/Users/liaozibo/code/demo/spark-demo/spark-wiki.txt").javaRDD();
        // Split each line on spaces, producing one element per word
        JavaRDD<String> wordRdd = lineRdd.flatMap(line -> Arrays.asList(line.split(" ")).iterator());
        JavaRDD<String> cleanWordRdd = wordRdd.filter(word -> !word.trim().isEmpty());
        List<Tuple2<Integer, String>> top5 = cleanWordRdd.mapToPair(word -> new Tuple2<>(word, 1))
                .reduceByKey(Integer::sum)  // sum counts per key (word)
                .mapToPair(Tuple2::swap)    // swap to (count, word) so the count becomes the key
                .sortByKey(false)           // sort by key (count), descending
                .take(5);
        top5.forEach(System.out::println);
        spark.stop();
    }
}
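Since the build already pulls in spark-sql, the same top-5 count can also be expressed with the higher-level Dataset API. A minimal sketch (the class name WordCountSql and the reuse of the same input path are assumptions for illustration):

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;
import static org.apache.spark.sql.functions.*;

public class WordCountSql {
    public static void main(String[] args) {
        SparkSession spark = SparkSession.builder().getOrCreate();
        Dataset<Row> top5 = spark.read()
                .textFile("/Users/liaozibo/code/demo/spark-demo/spark-wiki.txt")
                .select(explode(split(col("value"), " ")).as("word")) // one row per word
                .filter(trim(col("word")).notEqual(""))               // drop empty tokens
                .groupBy("word").count()                              // count per word
                .orderBy(col("count").desc())                         // most frequent first
                .limit(5);
        top5.show();
        spark.stop();
    }
}

Here the groupBy/orderBy plan is handled by Spark's optimizer instead of the hand-rolled swap-and-sort of the RDD version.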
Run
Package the code: mvn package
Run the job (local[*] runs Spark in a single local JVM, using as many worker threads as there are cores):
SPARK_HOME=/Users/liaozibo/app/spark-3.5.2-bin-hadoop3
$SPARK_HOME/bin/spark-submit --class "WordCount" --master "local[*]" target/spark-demo-1.0-SNAPSHOT.jar
From: https://www.cnblogs.com/liaozibo/p/18397388