// ---- Shell setup (run OUTSIDE spark-shell) ----------------------------------
// The URL must be quoted: in a shell, each unquoted '&' backgrounds the command
// and silently drops every remaining query parameter. The downloaded file name
// contains '?' and must be quoted for the same reason.
//
//   wget "http://real-chart.finance.yahoo.com/table.csv?s=AAPL&d=6&e=4&f=2015&g=d&a=11&b=12&c=1980&ignore=.csv"
//   mv "table.csv?s=AAPL" table.csv
//   hadoop fs -put ./table.csv /data/
//   spark-shell --master yarn-client --driver-memory 512m --executor-memory 512m
// -----------------------------------------------------------------------------

import org.apache.spark.sql._

// Load the raw CSV from HDFS. The hostname must contain no spaces
// (original had "sandbox.hortonworks. com").
val base_data = sc.textFile("hdfs://sandbox.hortonworks.com:8020/data/table.csv")

// Strip the header: keep every line whose first character differs from the
// header's first character (header starts with 'D' of "Date"; data rows start
// with a digit).
val attributes = base_data.first
// Fixed: original filtered `apple_stocks`, which is never defined — the raw
// RDD is `base_data`.
val data = base_data.filter(_(0) != attributes(0))

// One record per trading day.
// Fixed: the original case class had 7 fields but the constructor below passes
// 8 arguments; `year` (derived from the date string) is required both for the
// constructor call and for the GROUP BY year query at the end.
case class AppleStockRecord(date: String, open: Float, high: Float, low: Float,
                            close: Float, volume: Integer, adjClose: Float,
                            year: Int)

// Parse each CSV row into a typed record and convert to a DataFrame.
val applestock = data.map(_.split(",")).map(row =>
  AppleStockRecord(
    row(0),                              // date, e.g. "2015-07-02"
    row(1).trim.toFloat,                 // open
    row(2).trim.toFloat,                 // high
    row(3).trim.toFloat,                 // low
    row(4).trim.toFloat,                 // close
    row(5).trim.toInt,                   // volume
    row(6).trim.toFloat,                 // adjusted close
    row(0).trim.substring(0, 4).toInt    // year from "YYYY-MM-DD"
  )).toDF()

// Register for SQL queries (Spark 1.x API, matching the spark-shell above).
applestock.registerTempTable("applestock")
applestock.show
applestock.count

// Days the stock closed at or above its open.
// Fixed: bare `sql(...)` is not in scope in the Spark 1.x shell — use
// `sqlContext.sql`. Also moved the `output.map(...)` print AFTER `output`
// is defined (original used it before any definition).
val output = sqlContext.sql("SELECT * FROM applestock WHERE close >= open")
output.map(t => "Record: " + t.toString).collect().foreach(println)

// Largest single-day gain (close - open).
// Distinct val names: repeated `val output =` bindings are redefinition
// errors outside the REPL.
val maxGain = sqlContext.sql("SELECT MAX(close-open) FROM applestock")

// Ten highest intraday highs on record.
val topHighs = sqlContext.sql(
  "SELECT date, high FROM applestock ORDER BY high DESC LIMIT 10")

// Average daily volume per year, for years after 1999 (uses the `year`
// column added to AppleStockRecord above).
val avgVolumeByYear = sqlContext.sql(
  "SELECT year, AVG(volume) FROM applestock WHERE year > 1999 GROUP BY year")