thirdwave

Github Mirror

Doc

import pandas as pd
# Load the temperature series. The CSV has no header row, so its single
# column is named 'temp' explicitly.
df = pd.read_csv('nytemp.csv', header=None, names=['temp'])
# Attach one year label per row, 1869 through 2017 (149 values). pandas
# raises if the CSV does not contain exactly that many rows.
df['year'] = range(1869, 2018)
print(df.head())
# NOTE(review): `plt` is not imported anywhere in this snippet -- presumably
# matplotlib.pyplot supplied by the surrounding session (e.g. pylab); confirm.
plt.title('January Average NY Temparature')
df.temp.plot()
# 9-row rolling mean drawn in red over the raw series. The module-level
# pd.rolling_mean() helper was deprecated in pandas 0.18 and removed in 0.23;
# Series.rolling(...).mean() is the supported equivalent.
df.temp.rolling(window=9).mean().plot(color='red')
plt.savefig('nytemp.png')

   temp  year
0  35.1  1869
1  37.5  1870
2  28.3  1871
3  28.8  1872
4  28.6  1873
###############################

Some scraping code

from datetime import datetime, timedelta
from urllib2 import urlopen
from bs4 import BeautifulSoup
import os

def parse_station(station, current_date):
    """Extract the daily minimum and maximum temperature from a cached page.

    Reads the file ``<station>/<year>-<month>-<day>.html`` (relative to the
    current working directory) and looks inside the element whose id is
    ``historyTable`` for ``<span class="wx-value">`` entries.

    Args:
        station: Station code; also the directory that holds the cached pages.
        current_date: Object exposing ``year``, ``month`` and ``day``
            attributes that select which cached page to read.

    Returns:
        A ``(min_temp, max_temp)`` tuple holding the text of the selected
        spans (strings, not numbers).

    Raises:
        IOError/OSError: If the cached page cannot be opened.
        ValueError: If the page has no ``historyTable`` element or the table
            holds fewer than six ``wx-value`` spans.
    """
    page_path = '{}/{}-{}-{}.html'.format(station,
                                          current_date.year,
                                          current_date.month,
                                          current_date.day)
    # Only the read needs the file handle; parse after it is closed.
    with open(page_path) as in_file:
        soup = BeautifulSoup(in_file.read(), 'html.parser')

    history_table = soup.find(id='historyTable')
    if history_table is None:
        # find() returns None when nothing matches; fail with the file name
        # instead of an opaque AttributeError on None.
        raise ValueError('no historyTable element in {}'.format(page_path))

    weather_data = history_table.find_all('span', class_='wx-value')
    if len(weather_data) < 6:
        raise ValueError('expected at least 6 wx-value entries in {}, found {}'
                         .format(page_path, len(weather_data)))

    # The 3rd and 6th values are taken as the max and min temperature.
    # NOTE(review): these positions depend on the page layout -- confirm
    # against a saved page if the site markup changes.
    actual_max_temp = weather_data[2].text
    actual_min_temp = weather_data[5].text
    return actual_min_temp, actual_max_temp

def scrape_station(station, syear=1970, eyear=2018):
    """Download and parse the January daily-history pages for ``station``.

    For every day of January in each year of ``range(syear, eyear)`` the page
    is fetched from wunderground.com and cached as
    ``<station>/<year>-<month>-<day>.html``. Each newly downloaded page is
    parsed with ``parse_station`` and one line ``<date>,<min>,<max>`` is
    appended to ``nytemp.csv`` in the current working directory.

    Pages that are already cached are skipped entirely: they are neither
    downloaded again nor re-parsed, so they add no CSV row.

    Args:
        station: Station code used both in the URL and as the cache directory
            name. The directory is created if it does not exist.
        syear: First year to scrape (inclusive).
        eyear: Year at which to stop (exclusive).
    """
    if not os.path.isdir(station):
        os.mkdir(station)
    lookup_URL = 'http://www.wunderground.com/history/airport/{}/{}/{}/{}/DailyHistory.html'

    # January has 31 days; the upper bound of range() is exclusive, so the
    # loop below must stop at 32 to include the 30th and 31st.
    days_in_january = 31

    # The context manager guarantees the CSV is closed even if a download or
    # parse step raises part-way through.
    with open("nytemp.csv", "a") as fout:
        for yr in range(syear, eyear):
            for day in range(1, days_in_january + 1):
                current_date = datetime(year=yr, month=1, day=day)
                out_file_name = '{}/{}-{}-{}.html'.format(station,
                                                          current_date.year,
                                                          current_date.month,
                                                          current_date.day)
                formatted_lookup_URL = lookup_URL.format(station,
                                                         current_date.year,
                                                         current_date.month,
                                                         current_date.day)
                print(formatted_lookup_URL)

                if os.path.isfile(out_file_name):
                    print('skipping...')
                    continue

                # Store the response bytes untouched. Decoding to unicode and
                # writing through a text-mode file makes Python 2 re-encode
                # as ASCII, which fails on any non-ASCII character.
                html = urlopen(formatted_lookup_URL).read()
                with open(out_file_name, 'wb') as out_file:
                    out_file.write(html)

                # Parse the page for the station that was actually requested.
                min_temp, max_temp = parse_station(station, current_date)
                fout.write("%s,%s,%s\n" % (current_date, min_temp, max_temp))
                # Flush per row so an interrupted run keeps what it has.
                fout.flush()

# Run the scraper for station code 'KNYC' as soon as this script is executed.
scrape_station('KNYC')