Weather Data Analysis

© Chandan Shastri

In [1]:
import os
from time import time

import tensorflow as tf
from pyspark.sql import SparkSession

Creating a Spark session:

In [2]:
spark = SparkSession.builder.master('local').appName("csds_test").enableHiveSupport().getOrCreate()
In [3]:
spark
Out[3]:

SparkSession - hive

SparkContext

Spark UI

Version
v3.0.0-preview2
Master
local
AppName
csds_test

Loading data into a Spark DataFrame from a MySQL database:

In [4]:
isro = spark.read.jdbc("jdbc:mysql://localhost:3306","rrsc_aws.aws_log",properties={'user':'chandanshastri','password':'superman'})
In [5]:
isro.show(10,truncate=False)
+-------------------+------+---------+-------+------+-----+-----+-----+-----+-----+--------+-------+----------+-------+--------+
|RTIME              |REC_NO|Battery_V|AirTemp|RelHum|WS   |WD   |RAINt|RAINa|VWC  |SoilTemp|SlrAvg |SlrKjTot  |SlrWha |Slr_kJ_a|
+-------------------+------+---------+-------+------+-----+-----+-----+-----+-----+--------+-------+----------+-------+--------+
|2017-04-04 16:30:00|8     |13.31    |34.14  |29.65 |0.394|157.2|0.000|0.000|35.78|27.86   |121.400|218.480400|524.808|1889.308|
|2017-04-04 16:45:00|9     |13.28    |34.02  |31.51 |0.128|292.2|0.000|0.000|35.84|27.92   |99.700 |179.383700|574.636|2068.691|
|2017-04-04 17:00:00|10    |13.24    |33.65  |31.36 |0.183|184.9|0.000|0.000|35.84|27.97   |79.200 |142.552400|614.234|2211.243|
|2017-04-04 17:15:00|11    |13.23    |33.36  |31.74 |0.108|169.8|0.000|0.000|35.84|28.02   |60.500 |108.905900|644.486|2320.150|
|2017-04-04 17:30:00|12    |13.23    |33.30  |31.03 |0.175|210.4|0.000|0.000|35.83|28.06   |48.370 |87.061010 |668.669|2407.209|
|2017-04-04 17:45:00|13    |13.20    |32.99  |30.94 |0.279|306.4|0.000|0.000|35.83|28.11   |28.310 |50.952890 |682.823|2458.161|
|2017-04-04 18:00:00|14    |13.19    |32.54  |31.94 |0.251|271.6|0.000|0.000|35.83|28.15   |16.150 |29.061720 |690.896|2487.223|
|2017-04-04 18:15:00|15    |13.15    |32.26  |32.98 |0.205|219.9|0.000|0.000|35.83|28.19   |3.102  |5.584118  |692.447|2492.809|
|2017-04-04 18:30:00|16    |13.15    |31.97  |33.60 |0.014|82.1 |0.000|0.000|35.83|28.23   |0.015  |0.026112  |692.454|2492.835|
|2017-04-04 18:45:00|17    |13.14    |31.66  |34.41 |0.307|322.9|0.000|0.000|35.83|28.26   |0.000  |0.000000  |692.454|2492.835|
+-------------------+------+---------+-------+------+-----+-----+-----+-----+-----+--------+-------+----------+-------+--------+
only showing top 10 rows

Total number of rows in our data:

In [6]:
isro.count()
Out[6]:
29825
In [7]:
isro.printSchema()
root
 |-- RTIME: timestamp (nullable = true)
 |-- REC_NO: integer (nullable = true)
 |-- Battery_V: decimal(10,2) (nullable = true)
 |-- AirTemp: decimal(10,2) (nullable = true)
 |-- RelHum: decimal(10,2) (nullable = true)
 |-- WS: decimal(10,3) (nullable = true)
 |-- WD: decimal(10,1) (nullable = true)
 |-- RAINt: decimal(10,3) (nullable = true)
 |-- RAINa: decimal(10,3) (nullable = true)
 |-- VWC: decimal(10,2) (nullable = true)
 |-- SoilTemp: decimal(10,2) (nullable = true)
 |-- SlrAvg: decimal(10,3) (nullable = true)
 |-- SlrKjTot: decimal(10,6) (nullable = true)
 |-- SlrWha: decimal(10,3) (nullable = true)
 |-- Slr_kJ_a: decimal(10,3) (nullable = true)

Selecting only the 15-minute intervals with more than 1 mm of rainfall:

In [8]:
test = isro.filter((isro.RAINt>1)).select('RTIME','AirTemp','RelHum','RAINt')
In [9]:
test.show(10,truncate=False)
+-------------------+-------+------+-----+
|RTIME              |AirTemp|RelHum|RAINt|
+-------------------+-------+------+-----+
|2017-04-05 21:15:00|21.95  |81.30 |8.890|
|2017-04-06 20:45:00|23.38  |81.20 |1.016|
|2017-04-06 21:15:00|24.82  |78.76 |1.524|
|2017-04-06 21:30:00|22.39  |91.50 |1.270|
|2017-04-17 16:15:00|24.06  |66.82 |7.112|
|2017-05-09 19:00:00|25.42  |73.37 |1.270|
|2017-05-09 19:15:00|22.69  |90.90 |1.778|
|2017-05-10 19:45:00|22.85  |85.80 |1.524|
|2017-05-11 14:30:00|25.95  |80.70 |2.540|
|2017-05-14 15:45:00|29.13  |73.50 |1.016|
+-------------------+-------+------+-----+
only showing top 10 rows

Average temperature for the whole period:

In [10]:
# Time the full-period average air-temperature aggregation.
t = time()
isro.agg({"AirTemp": "avg"}).show()
# time() returns SECONDS — the original label said "ms", which was wrong by 1000x.
print("Total time taken : " + str(time() - t) + " seconds")
+------------+
|avg(AirTemp)|
+------------+
|   23.502284|
+------------+

Total time taken : 0.4559957981109619 seconds

Plotting a time vs. temperature graph for the rainy intervals:

In [11]:
import matplotlib.pyplot as plt
In [12]:
rtime = test.select('RTIME').collect()
In [13]:
temp = test.select('AirTemp').collect()
In [14]:
plt.plot(rtime,temp)
Out[14]:
[<matplotlib.lines.Line2D at 0x1e6df6acec8>]
In [15]:
isro.write.csv('isro.csv')
In [16]:
df3 = spark.read.csv('isro.csv',inferSchema=True)
In [17]:
df3.count()
Out[17]:
29825
In [18]:
df3.printSchema()
root
 |-- _c0: timestamp (nullable = true)
 |-- _c1: integer (nullable = true)
 |-- _c2: double (nullable = true)
 |-- _c3: double (nullable = true)
 |-- _c4: double (nullable = true)
 |-- _c5: double (nullable = true)
 |-- _c6: double (nullable = true)
 |-- _c7: double (nullable = true)
 |-- _c8: double (nullable = true)
 |-- _c9: double (nullable = true)
 |-- _c10: double (nullable = true)
 |-- _c11: double (nullable = true)
 |-- _c12: double (nullable = true)
 |-- _c13: double (nullable = true)
 |-- _c14: double (nullable = true)

In [19]:
spark.sql("create database chandan")
Out[19]:
DataFrame[]
In [20]:
# test.write.jdbc("jdbc:mysql://localhost:3306","rrsc_aws.aws_test",properties={'user':'chandanshastri','password':'<redacted>'})
In [21]:
test.write.mode("append").saveAsTable("chandan.test")
In [23]:
spark.sql("use chandan").show()
++
||
++
++

In [24]:
spark.sql("show tables").show()
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| chandan|     test|      false|
+--------+---------+-----------+

In [25]:
spark.sql("select * from test").show()
+-------------------+-------+------+-----+
|              RTIME|AirTemp|RelHum|RAINt|
+-------------------+-------+------+-----+
|2017-04-05 21:15:00|  21.95| 81.30|8.890|
|2017-04-06 20:45:00|  23.38| 81.20|1.016|
|2017-04-06 21:15:00|  24.82| 78.76|1.524|
|2017-04-06 21:30:00|  22.39| 91.50|1.270|
|2017-04-17 16:15:00|  24.06| 66.82|7.112|
|2017-05-09 19:00:00|  25.42| 73.37|1.270|
|2017-05-09 19:15:00|  22.69| 90.90|1.778|
|2017-05-10 19:45:00|  22.85| 85.80|1.524|
|2017-05-11 14:30:00|  25.95| 80.70|2.540|
|2017-05-14 15:45:00|  29.13| 73.50|1.016|
|2017-05-14 16:00:00|  23.52| 83.10|4.572|
|2017-05-14 16:15:00|  22.57| 78.99|1.524|
|2017-05-17 16:00:00|  27.66| 75.14|1.270|
|2017-05-17 21:45:00|  22.68| 89.10|4.064|
|2017-05-17 22:00:00|  20.49| 94.20|2.032|
|2017-05-17 22:15:00|  21.04| 94.20|2.286|
|2017-05-18 21:30:00|  20.64| 94.40|1.270|
|2017-05-18 21:45:00|  20.43| 94.60|1.270|
|2017-05-18 22:00:00|  20.16| 95.70|1.778|
|2017-05-18 22:15:00|  20.25| 96.00|1.778|
+-------------------+-------+------+-----+
only showing top 20 rows

In [31]:
df3.createTempView("test4view")
In [33]:
spark.sql("show tables").show()
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| chandan|     test|      false|
|        |test4view|       true|
+--------+---------+-----------+

In [34]:
spark.sql("CREATE TABLE chandan.test4 LIKE chandan.test location 'F:/HDFS/'")
Out[34]:
DataFrame[]
In [35]:
spark.sql("show tables").show()
+--------+---------+-----------+
|database|tableName|isTemporary|
+--------+---------+-----------+
| chandan|     test|      false|
| chandan|    test4|      false|
|        |test4view|       true|
+--------+---------+-----------+

In [38]:
spark.sql("insert into chandan.test4 select * from test")
Out[38]:
DataFrame[]
In [39]:
spark.sql("select * from chandan.test4").show()
+-------------------+-------+------+-----+
|              RTIME|AirTemp|RelHum|RAINt|
+-------------------+-------+------+-----+
|2017-04-05 21:15:00|  21.95| 81.30|8.890|
|2017-04-06 20:45:00|  23.38| 81.20|1.016|
|2017-04-06 21:15:00|  24.82| 78.76|1.524|
|2017-04-06 21:30:00|  22.39| 91.50|1.270|
|2017-04-17 16:15:00|  24.06| 66.82|7.112|
|2017-05-09 19:00:00|  25.42| 73.37|1.270|
|2017-05-09 19:15:00|  22.69| 90.90|1.778|
|2017-05-10 19:45:00|  22.85| 85.80|1.524|
|2017-05-11 14:30:00|  25.95| 80.70|2.540|
|2017-05-14 15:45:00|  29.13| 73.50|1.016|
|2017-05-14 16:00:00|  23.52| 83.10|4.572|
|2017-05-14 16:15:00|  22.57| 78.99|1.524|
|2017-05-17 16:00:00|  27.66| 75.14|1.270|
|2017-05-17 21:45:00|  22.68| 89.10|4.064|
|2017-05-17 22:00:00|  20.49| 94.20|2.032|
|2017-05-17 22:15:00|  21.04| 94.20|2.286|
|2017-05-18 21:30:00|  20.64| 94.40|1.270|
|2017-05-18 21:45:00|  20.43| 94.60|1.270|
|2017-05-18 22:00:00|  20.16| 95.70|1.778|
|2017-05-18 22:15:00|  20.25| 96.00|1.778|
+-------------------+-------+------+-----+
only showing top 20 rows

In [ ]: