虽然数据导出来了,但是报错了。之前是正常的,加了 saveAsTextFile 后就报错了。

来源:9-6 数据清洗之第一步原始日志解析

进击的大黄鸭

2017-07-14

17/07/14 11:07:58 ERROR Utils: Aborting task

java.lang.ArrayIndexOutOfBoundsException: 599

at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2312)

at java.util.Calendar.setTimeInMillis(Calendar.java:1804)

at java.util.Calendar.setTime(Calendar.java:1770)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:943)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:936)

at java.text.DateFormat.format(DateFormat.java:345)

at com.imooc.spark.DataUtils$.parse(DataUtils.scala:20)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:27)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:18)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1211)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)

at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)

at org.apache.spark.scheduler.Task.run(Task.scala:99)

at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)

at java.lang.Thread.run(Thread.java:748)

17/07/14 11:07:58 ERROR Executor: Exception in task 1.0 in stage 0.0 (TID 1)

java.lang.ArrayIndexOutOfBoundsException: 599

at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2312)

at java.util.Calendar.setTimeInMillis(Calendar.java:1804)

at java.util.Calendar.setTime(Calendar.java:1770)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:943)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:936)

at java.text.DateFormat.format(DateFormat.java:345)

at com.imooc.spark.DataUtils$.parse(DataUtils.scala:20)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:27)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:18)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1211)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)

at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)

at org.apache.spark.scheduler.Task.run(Task.scala:99)

at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)

at java.lang.Thread.run(Thread.java:748)

17/07/14 11:07:58 WARN TaskSetManager: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 599

at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2312)

at java.util.Calendar.setTimeInMillis(Calendar.java:1804)

at java.util.Calendar.setTime(Calendar.java:1770)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:943)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:936)

at java.text.DateFormat.format(DateFormat.java:345)

at com.imooc.spark.DataUtils$.parse(DataUtils.scala:20)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:27)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:18)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1211)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)

at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)

at org.apache.spark.scheduler.Task.run(Task.scala:99)

at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)

at java.lang.Thread.run(Thread.java:748)

17/07/14 11:07:58 INFO DAGScheduler: ResultStage 0 (saveAsTextFile at SparkStatFormatJob.scala:28) failed in 1.158 s due to Job aborted due to stage failure: Task 1 in stage 0.0 failed 1 times, most recent failure: Lost task 1.0 in stage 0.0 (TID 1, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException: 599

at sun.util.calendar.BaseCalendar.getCalendarDateFromFixedDate(BaseCalendar.java:453)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2397)

at java.util.GregorianCalendar.computeFields(GregorianCalendar.java:2312)

at java.util.Calendar.setTimeInMillis(Calendar.java:1804)

at java.util.Calendar.setTime(Calendar.java:1770)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:943)

at java.text.SimpleDateFormat.format(SimpleDateFormat.java:936)

at java.text.DateFormat.format(DateFormat.java:345)

at com.imooc.spark.DataUtils$.parse(DataUtils.scala:20)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:27)

at com.imooc.spark.SparkStatFormatJob$$anonfun$main$1.apply(SparkStatFormatJob.scala:18)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply$mcV$sp(PairRDDFunctions.scala:1211)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13$$anonfun$apply$7.apply(PairRDDFunctions.scala:1210)

at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1341)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1218)

at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1$$anonfun$13.apply(PairRDDFunctions.scala:1197)

at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)

at org.apache.spark.scheduler.Task.run(Task.scala:99)

at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)

at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)

at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)

at java.lang.Thread.run(Thread.java:748)


写回答

2回答

Michael_PK

2017-07-14

先在代码中添加日志输出,定位到哪一行数据有问题。

0
2
进击的大黄鸭
非常感谢!
2017-07-14
共2条回复

进击的大黄鸭

提问者

2017-07-14

在 Windows 本地运行的话,貌似要设置 Hadoop 的环境变量,不然会报空指针错误:java.lang.NullPointerException。设置方法看这篇文章:http://blog.csdn.net/kimyoungvon/article/details/51308651 。设置好了就会正常输出 output,我是这样解决的,有类似问题的同学可以借鉴。

2
0

以慕课网日志分析为例 进入大数据Spark SQL的世界

快速转型大数据:Hadoop,Hive,SparkSQL步步为赢

1644 学习 · 1129 问题

查看课程