From c0563d3198b2468983c4eb4f9b4678094ec9dcca Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Sun, 17 Feb 2019 18:07:30 -0800 Subject: [PATCH 1/7] pvdaq io functions --- pvlib/iotools/pvdaq.py | 124 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 124 insertions(+) create mode 100644 pvlib/iotools/pvdaq.py diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py new file mode 100644 index 0000000000..ffd14a5a18 --- /dev/null +++ b/pvlib/iotools/pvdaq.py @@ -0,0 +1,124 @@ +"""Functions to read NREL PVDAQ data +""" + +from time import time +from io import StringIO +import sys +from datetime import timedelta + +import requests +import numpy as np +import pandas as pd + +def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): + """This fuction queries one or more years of raw PV system data from NREL's PVDAQ data service: + https://maps.nrel.gov/pvdaq/ + + This function uses the annual raw data file API, which is the most efficient way of accessing + multi-year, sub-hourly time series data. + + Parameters + ---------- + sysid: int + The system ID corresponding to the site that data should be queried from + api_key: string + Your API key (https://developer.nrel.gov/docs/api-key/) + year: int of list of ints + Either the year to request or the list of years to request. Multiple years will be concatenated + into a single data frame + delim: string + The deliminator used in the CSV file being requested + + Returns + ------- + label: pandas data frame + A data frame containing the tabular time series data from the PVDAQ service over the years + requested + + """ + # Force year to be a list of integers + ti = time() + try: + year = int(year) + except TypeError: + year = [int(yr) for yr in year] + else: + year = [year] + # Each year must queries separately, so iterate over the years and generate a list of dataframes. 
+ df_list = [] + it = 0 + for yr in year: + progress(it, len(year), 'querying year {}'.format(year[it])) + req_params = { + 'api_key': api_key, + 'system_id': sysid, + 'year': yr + } + base_url = 'https://developer.nrel.gov/api/pvdaq/v3/data_file?' + param_list = [str(item[0]) + '=' + str(item[1]) for item in req_params.items()] + req_url = base_url + '&'.join(param_list) + response = requests.get(req_url) + if int(response.status_code) != 200: + print('\n error: ', response.status_code) + return + df = pd.read_csv(StringIO(response.text), delimiter=delim) + df_list.append(df) + it += 1 + tf = time() + progress(it, len(year), 'queries complete in {:.1f} seconds '.format(tf - ti)) + # concatenate the list of yearly data frames + df = pd.concat(df_list, axis=0, sort=True) + # convert index to timeseries + df = standardize_time_axis(df, datetimekey='Date-Time') + return df + +def standardize_time_axis(df, datetimekey='Date-Time'): + ''' + This function takes in a pandas data frame containing tabular time series data, likely generated with a call to + pandas.read_csv(). It is assumed that each row of the data frame corresponds to a unique date-time, though not + necessarily on standard intervals. This function will attempt to convert a user-specified column containing time + stamps to python datetime objects, assign this column to the index of the data frame, and then standardize the + index over time. By standardize, we mean reconstruct the index to be at regular intervals, starting at midnight of + the first day of the data set. This solves a couple common data errors when working with raw data. (1) Missing data + points from skipped scans in the data acquisition system. (2) Time stamps that are at irregular exact times, + including fractional seconds. 
+ :param df: A pandas data frame containing the tabular time series data + :param datetimekey: An optional key corresponding to the name of the column that contains the time stamps + :return: A new data frame with a standardized time axis + ''' + # convert index to timeseries + try: + df[datetimekey] = pd.to_datetime(df[datetimekey]) + df.set_index('Date-Time', inplace=True) + except KeyError: + time_cols = [col for col in df.columns if np.logical_or('Time' in col, 'time' in col)] + key = time_cols[0] + df[datetimekey] = pd.to_datetime(df[key]) + df.set_index(datetimekey, inplace=True) + # standardize the timeseries axis to a regular frequency over a full set of days + diff = (df.index[1:] - df.index[:-1]).seconds + freq = int(np.median(diff)) # the number of seconds between each measurement + start = df.index[0] + end = df.index[-1] + time_index = pd.date_range(start=start.date(), end=end.date() + timedelta(days=1), freq='{}s'.format(freq))[:-1] + df = df.reindex(index=time_index, method='nearest') + return df.fillna(value=0) + + +def progress(count, total, status=''): + """ + Python command line progress bar in less than 10 lines of code. 
· GitHub + https://gist.github.com/vladignatyev/06860ec2040cb497f0f3 + :param count: the current count, int + :param total: to total count, int + :param status: a message to display + :return: + """ + bar_len = 60 + filled_len = int(round(bar_len * count / float(total))) + + percents = round(100.0 * count / float(total), 1) + bar = '=' * filled_len + '-' * (bar_len - filled_len) + + sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status)) + sys.stdout.flush() \ No newline at end of file From 32bda8aa09fcd94fcb3c5f38ecc1075d02ab734c Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Mon, 18 Feb 2019 10:56:18 -0800 Subject: [PATCH 2/7] fixing line length issues --- pvlib/iotools/pvdaq.py | 67 ++++++++++++++++++++++++++---------------- 1 file changed, 42 insertions(+), 25 deletions(-) diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py index ffd14a5a18..f9fa5e8279 100644 --- a/pvlib/iotools/pvdaq.py +++ b/pvlib/iotools/pvdaq.py @@ -11,29 +11,30 @@ import pandas as pd def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): - """This fuction queries one or more years of raw PV system data from NREL's PVDAQ data service: - https://maps.nrel.gov/pvdaq/ + """This fuction queries one or more years of raw PV system data from NREL's + PVDAQ data service: https://maps.nrel.gov/pvdaq/ - This function uses the annual raw data file API, which is the most efficient way of accessing - multi-year, sub-hourly time series data. + This function uses the annual raw data file API, which is the most + efficient way of accessing multi-year, sub-hourly time series data. Parameters ---------- sysid: int - The system ID corresponding to the site that data should be queried from + The system ID corresponding to the site that data should be + queried from api_key: string Your API key (https://developer.nrel.gov/docs/api-key/) year: int of list of ints - Either the year to request or the list of years to request. 
Multiple years will be concatenated - into a single data frame + Either the year to request or the list of years to request. Multiple + years will be concatenated into a single data frame delim: string The deliminator used in the CSV file being requested Returns ------- label: pandas data frame - A data frame containing the tabular time series data from the PVDAQ service over the years - requested + A data frame containing the tabular time series data from the PVDAQ + service over the years requested """ # Force year to be a list of integers @@ -44,7 +45,8 @@ def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): year = [int(yr) for yr in year] else: year = [year] - # Each year must queries separately, so iterate over the years and generate a list of dataframes. + # Each year must queries separately, so iterate over the years and + # generate a list of dataframes. df_list = [] it = 0 for yr in year: @@ -55,7 +57,8 @@ def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): 'year': yr } base_url = 'https://developer.nrel.gov/api/pvdaq/v3/data_file?' 
- param_list = [str(item[0]) + '=' + str(item[1]) for item in req_params.items()] + param_list = [str(item[0]) + '=' + str(item[1]) + for item in req_params.items()] req_url = base_url + '&'.join(param_list) response = requests.get(req_url) if int(response.status_code) != 200: @@ -65,7 +68,8 @@ def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): df_list.append(df) it += 1 tf = time() - progress(it, len(year), 'queries complete in {:.1f} seconds '.format(tf - ti)) + msg = 'queries complete in {:.1f} seconds '.format(tf - ti) + progress(it, len(year), msg) # concatenate the list of yearly data frames df = pd.concat(df_list, axis=0, sort=True) # convert index to timeseries @@ -74,16 +78,23 @@ def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): def standardize_time_axis(df, datetimekey='Date-Time'): ''' - This function takes in a pandas data frame containing tabular time series data, likely generated with a call to - pandas.read_csv(). It is assumed that each row of the data frame corresponds to a unique date-time, though not - necessarily on standard intervals. This function will attempt to convert a user-specified column containing time - stamps to python datetime objects, assign this column to the index of the data frame, and then standardize the - index over time. By standardize, we mean reconstruct the index to be at regular intervals, starting at midnight of - the first day of the data set. This solves a couple common data errors when working with raw data. (1) Missing data - points from skipped scans in the data acquisition system. (2) Time stamps that are at irregular exact times, - including fractional seconds. + This function takes in a pandas data frame containing tabular time series + data, likely generated with a call to pandas.read_csv(). It is assumed that + each row of the data frame corresponds to a unique date-time, though not + necessarily on standard intervals. 
This function will attempt to convert a + user-specified column containing time stamps to python datetime objects, + assign this column to the index of the data frame, and then standardize the + index over time. By standardize, we mean reconstruct the index to be at + regular intervals, starting at midnight of the first day of the data set. + This solves a couple common data errors when working with raw data. + (1) Missing data points from skipped scans in the data acquisition + system. + (2) Time stamps that are at irregular exact times, including fractional + seconds. + :param df: A pandas data frame containing the tabular time series data - :param datetimekey: An optional key corresponding to the name of the column that contains the time stamps + :param datetimekey: An optional key corresponding to the name of the column + that contains the time stamps :return: A new data frame with a standardized time axis ''' # convert index to timeseries @@ -91,16 +102,22 @@ def standardize_time_axis(df, datetimekey='Date-Time'): df[datetimekey] = pd.to_datetime(df[datetimekey]) df.set_index('Date-Time', inplace=True) except KeyError: - time_cols = [col for col in df.columns if np.logical_or('Time' in col, 'time' in col)] + time_cols = [col for col in df.columns + if np.logical_or('Time' in col, 'time' in col)] key = time_cols[0] df[datetimekey] = pd.to_datetime(df[key]) df.set_index(datetimekey, inplace=True) - # standardize the timeseries axis to a regular frequency over a full set of days + # standardize the timeseries axis to a regular frequency over + # a full set of days diff = (df.index[1:] - df.index[:-1]).seconds - freq = int(np.median(diff)) # the number of seconds between each measurement + freq = int(np.median(diff)) # the number of secs between each measurement start = df.index[0] end = df.index[-1] - time_index = pd.date_range(start=start.date(), end=end.date() + timedelta(days=1), freq='{}s'.format(freq))[:-1] + time_index = pd.date_range( + start=start.date(), 
+ end=end.date() + timedelta(days=1), + freq='{}s'.format(freq) + )[:-1] df = df.reindex(index=time_index, method='nearest') return df.fillna(value=0) From a6373c872bc7f1d19ccbaa0fb0bdcdae6f562543 Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Mon, 18 Feb 2019 10:58:05 -0800 Subject: [PATCH 3/7] importing new function in package init --- pvlib/iotools/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pvlib/iotools/__init__.py b/pvlib/iotools/__init__.py index 112cc6fbcf..9f349418c5 100644 --- a/pvlib/iotools/__init__.py +++ b/pvlib/iotools/__init__.py @@ -7,3 +7,4 @@ from pvlib.iotools.midc import read_midc_raw_data_from_nrel # noqa: F401 from pvlib.iotools.ecmwf_macc import read_ecmwf_macc # noqa: F401 from pvlib.iotools.ecmwf_macc import get_ecmwf_macc # noqa: F401 +from pvlib.iotools.pvdaq import get_pvdaq_data From 6d8eb6696b9cf0e718a6b04f9848c190d7e428bf Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Mon, 18 Feb 2019 11:00:33 -0800 Subject: [PATCH 4/7] fixing blank lines --- pvlib/iotools/pvdaq.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py index f9fa5e8279..992a93b125 100644 --- a/pvlib/iotools/pvdaq.py +++ b/pvlib/iotools/pvdaq.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd + def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): """This fuction queries one or more years of raw PV system data from NREL's PVDAQ data service: https://maps.nrel.gov/pvdaq/ @@ -76,6 +77,7 @@ def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): df = standardize_time_axis(df, datetimekey='Date-Time') return df + def standardize_time_axis(df, datetimekey='Date-Time'): ''' This function takes in a pandas data frame containing tabular time series From 4ce7826e74a9eadbf705781ff62226d53c8b87e6 Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Mon, 18 Feb 2019 11:02:15 -0800 Subject: [PATCH 5/7] fixing other lint issues --- pvlib/iotools/pvdaq.py | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py index 992a93b125..3467a1c7af 100644 --- a/pvlib/iotools/pvdaq.py +++ b/pvlib/iotools/pvdaq.py @@ -11,7 +11,7 @@ import pandas as pd -def get_pvdaq_data(sysid=2, api_key = 'DEMO_KEY', year=2011, delim=','): +def get_pvdaq_data(sysid=2, api_key='DEMO_KEY', year=2011, delim=','): """This fuction queries one or more years of raw PV system data from NREL's PVDAQ data service: https://maps.nrel.gov/pvdaq/ @@ -140,4 +140,4 @@ def progress(count, total, status=''): bar = '=' * filled_len + '-' * (bar_len - filled_len) sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status)) - sys.stdout.flush() \ No newline at end of file + sys.stdout.flush() From 60041ed22c90543c98aea4136a3627881dd52e81 Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Mon, 18 Feb 2019 17:00:50 -0800 Subject: [PATCH 6/7] Adding a newline character after last progress bar write so that the progress bar is not overwritten by a later print statement in a user script. 
Also added an if __name__ == "__main__" block for testing purposes --- pvlib/iotools/pvdaq.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py index 3467a1c7af..4161ea19b2 100644 --- a/pvlib/iotools/pvdaq.py +++ b/pvlib/iotools/pvdaq.py @@ -71,6 +71,7 @@ def get_pvdaq_data(sysid=2, api_key='DEMO_KEY', year=2011, delim=','): tf = time() msg = 'queries complete in {:.1f} seconds '.format(tf - ti) progress(it, len(year), msg) + print('\n') # concatenate the list of yearly data frames df = pd.concat(df_list, axis=0, sort=True) # convert index to timeseries @@ -141,3 +142,8 @@ def progress(count, total, status=''): sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', status)) sys.stdout.flush() + + +if __name__ == "__main__": + df = get_pvdaq_data() + print(df.head()) From bebcea3eb1b6cfe48b4c21e917576202921db03e Mon Sep 17 00:00:00 2001 From: Bennet Meyers Date: Tue, 19 Feb 2019 18:47:06 -0800 Subject: [PATCH 7/7] allow user option for standardize_time_axis --- pvlib/iotools/pvdaq.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pvlib/iotools/pvdaq.py b/pvlib/iotools/pvdaq.py index 4161ea19b2..1c1f594e8a 100644 --- a/pvlib/iotools/pvdaq.py +++ b/pvlib/iotools/pvdaq.py @@ -11,7 +11,8 @@ import pandas as pd -def get_pvdaq_data(sysid=2, api_key='DEMO_KEY', year=2011, delim=','): +def get_pvdaq_data(sysid=2, api_key='DEMO_KEY', year=2011, delim=',', + standardize=True): """This fuction queries one or more years of raw PV system data from NREL's PVDAQ data service: https://maps.nrel.gov/pvdaq/ @@ -74,8 +75,8 @@ def get_pvdaq_data(sysid=2, api_key='DEMO_KEY', year=2011, delim=','): print('\n') # concatenate the list of yearly data frames df = pd.concat(df_list, axis=0, sort=True) - # convert index to timeseries - df = standardize_time_axis(df, datetimekey='Date-Time') + if standardize: + df = standardize_time_axis(df, datetimekey='Date-Time') return df