将代码提交到yarn上运行
来源:13-14 -将作业运行到YARN上
慕九州8702158
2020-09-23
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf
def get_grade(value):
    """Map an hourly PM2.5 reading to its air-quality grade label.

    Args:
        value: PM2.5 concentration for one hour. ``None`` and negative
            readings (the dataset uses sentinels such as -999 for missing
            samples — TODO confirm against the CSVs) are treated as invalid.

    Returns:
        The Chinese grade label as a string, or ``None`` for missing or
        invalid input.
    """
    # Guard first: in the original chain a negative sentinel like -999
    # failed the "<= 50 and >= 0" test but then matched "elif value <= 100",
    # so missing data was mislabelled as "中等". A None value would have
    # raised TypeError on comparison instead of returning None.
    if value is None or value < 0:
        return None
    if value <= 50:
        return "健康"
    if value <= 100:
        return "中等"
    if value <= 150:
        return "对敏感人群不健康"
    if value <= 200:
        return "不健康"
    if value <= 300:
        return "非常不健康"
    if value <= 500:
        return "危险"
    return "爆表"
if __name__ == '__main__':
    spark = SparkSession.builder.appName("ww_Pro").getOrCreate()

    # Wrap the Python grader as a Spark UDF that yields a string label.
    grade_function_udf = udf(get_grade, StringType())

    # One CSV of hourly PM2.5 readings per year. The files share a schema,
    # so the read -> grade -> aggregate -> show pipeline is a single loop
    # instead of three copy-pasted variants.
    paths = [
        "/data/Beijing_2015_HourlyPM25_created20160201.csv",
        "/data/Beijing_2016_HourlyPM25_created20170201.csv",
        "/data/Beijing_2017_HourlyPM25_created20170803.csv",
    ]
    for path in paths:
        data = (
            spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
            .select("Year", "Month", "Day", "Hour", "Value", "QC Name")
        )
        # Total row count for the year, computed once and reused below.
        total = data.count()
        # Count rows per grade label.
        grouped = (
            data.withColumn("Grade", grade_function_udf(data["Value"]))
            .groupBy("Grade")
            .count()
        )
        # Show each grade's count and its share of the year's total rows.
        grouped.select("Grade", "count", grouped["count"] / total).show()

    spark.stop()
报错如下:
写回答
1回答
-
你的提交脚本我看下
2020-09-24
相似问题