将代码提交到YARN上运行

来源:13-14 -将作业运行到YARN上

慕九州8702158

2020-09-23

from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import udf


def get_grade(value):
    """Map a numeric reading to its grade label.

    In this script the function is wrapped as a Spark UDF and applied to the
    ``Value`` column of the hourly PM2.5 CSV files.

    Args:
        value: The numeric reading, or ``None`` for a null cell.

    Returns:
        The grade label for the band containing ``value`` (bands are
        inclusive on their upper bound: 0-50, 51-100, 101-150, 151-200,
        201-300, 301-500, and above 500), or ``None`` when ``value`` is
        ``None``, negative, or NaN.
    """
    # A null cell reaches a Python UDF as None; comparing None with an int
    # raises TypeError on Python 3, so bail out before any comparison.
    if value is None:
        return None
    # Negative readings are not valid grades (presumably missing-data
    # sentinels -- confirm against the dataset). Without this guard they
    # would slip past the first band and be labelled by the "<= 100" band.
    if value < 0:
        return None

    bands = (
        (50, "健康"),
        (100, "中等"),
        (150, "对敏感人群不健康"),
        (200, "不健康"),
        (300, "非常不健康"),
        (500, "危险"),
    )
    for upper_bound, label in bands:
        if value <= upper_bound:
            return label

    # NaN fails every comparison, including this one, and so yields None.
    return "爆表" if value > 500 else None


if __name__ == '__main__':
    # Script entry point: load the three yearly hourly-PM2.5 CSV files,
    # label every row with a grade, and print per-grade counts together
    # with each grade's share of that year's rows.
    spark = SparkSession.builder.appName("ww_Pro").getOrCreate()
    try:
        csv_paths = (
            "/data/Beijing_2015_HourlyPM25_created20160201.csv",
            "/data/Beijing_2016_HourlyPM25_created20170201.csv",
            "/data/Beijing_2017_HourlyPM25_created20170803.csv",
        )
        wanted_columns = ("Year", "Month", "Day", "Hour", "Value", "QC Name")

        # Load every file up front (in year order) so the sequence of Spark
        # jobs matches the original copy-pasted version of this script.
        yearly_frames = [
            spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .load(path)
            .select(*wanted_columns)
            for path in csv_paths
        ]

        grade_function_udf = udf(get_grade, StringType())

        for data in yearly_frames:
            # Count rows per grade label.
            grouped = (
                data.withColumn("Grade", grade_function_udf(data['value']))
                .groupBy("Grade")
                .count()
            )
            # Show each grade's count and its fraction of the year's rows.
            grouped.select("Grade", "count", grouped["count"] / data.count()).show()
    finally:
        # Always stop the session, even when a load or job fails.
        spark.stop()

报错如下:
图片描述

写回答

1回答

Michael_PK

2020-09-23

你的提交脚本我看下

0
1
慕九州8702158
没问题了
2020-09-24
共1条回复

Python3实战Spark大数据分析及调度

使用Python3对Spark应用程序进行开发调优,掌握Azkaban任务调度

1046 学习 · 434 问题

查看课程