$ pip install pyspark
$ pyspark
$ docker run -it --rm spark:python3 /opt/spark/bin/pyspark
df = spark.read.json("logs.json")
df.where("age > 21").select("name.first").show()
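The examples on this page that read logs.json assume each record carries an age field and a nested name struct. A single hypothetical line of logs.json, invented purely for illustration:

{"name": {"first": "Ada", "last": "Lovelace"}, "age": 36}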
from pyspark.ml.regression import RandomForestRegressor

# Every record contains a label and feature vector
df = spark.createDataFrame(data, ["label", "features"])

# Split the data into train/test datasets
train_df, test_df = df.randomSplit([.80, .20], seed=42)

# Set hyperparameters for the algorithm
rf = RandomForestRegressor(numTrees=100)

# Fit the model to the training data
model = rf.fit(train_df)

# Generate predictions on the test dataset
model.transform(test_df).show()
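The snippet above assumes a data collection of (label, features) pairs already exists. A minimal sketch of such input, with values invented for illustration:

from pyspark.ml.linalg import Vectors

# Hypothetical toy records: a numeric label plus a dense feature vector.
data = [
    (1.0, Vectors.dense([0.0, 1.1, 0.1])),
    (0.0, Vectors.dense([2.0, 1.0, -1.0])),
    (0.0, Vectors.dense([2.0, 1.3, 1.0])),
    (1.0, Vectors.dense([0.0, 1.2, -0.5])),
]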
df = spark.read.csv("accounts.csv", header=True)

# Select subset of features and filter for balance > 0
filtered_df = df.select("AccountBalance", "CountOfDependents").filter("AccountBalance > 0")

# Generate summary statistics
filtered_df.summary().show()
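With header=True, the first line of the file supplies the column names. A hypothetical start of accounts.csv, invented for illustration:

AccountBalance,CountOfDependents
1250.50,2
0.00,1
310.75,0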
$ docker run -it --rm spark /opt/spark/bin/spark-sql
spark-sql>
SELECT name.first AS first_name, name.last AS last_name, age FROM json.`logs.json` WHERE age > 21;
$ docker run -it --rm spark /opt/spark/bin/spark-shell
scala>
val df = spark.read.json("logs.json")
df.where("age > 21")
  .select("name.first").show()
Dataset<Row> df = spark.read().json("logs.json");
df.where("age > 21")
  .select("name.first").show();
$ docker run -it --rm spark:r /opt/spark/bin/sparkR
>
df <- read.json(path = "logs.json")
df <- filter(df, df$age > 21)
head(select(df, df$name.first))
Spark SQL adapts the execution plan at runtime, for example by automatically setting the number of reducers and choosing join algorithms (a configuration sketch follows below).
Use the same SQL you are already familiar with.
Spark SQL works on structured tables and on unstructured data such as JSON or images.
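As a minimal sketch of that runtime adaptation: adaptive query execution is switched on through Spark SQL configuration settings. The spark.sql.adaptive.* keys below are standard Spark configs; the session name and query are illustrative only.

from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("aqe-demo")  # hypothetical application name
    # Re-optimize query plans at runtime using shuffle statistics.
    .config("spark.sql.adaptive.enabled", "true")
    # Let Spark coalesce shuffle partitions, i.e. pick the reducer count itself.
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    .getOrCreate()
)

# With AQE enabled, Spark can also switch join strategies mid-query,
# e.g. downgrading a sort-merge join to a broadcast join for small inputs.
df = spark.read.json("logs.json")
df.where("age > 21").groupBy("name.first").count().show()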