Source code for pygem.pygem_modelsetup

"""
Python Glacier Evolution Model (PyGEM)

copyright © 2018 David Rounce <drounce@cmu.edu

Distrubted under the MIT lisence

List of functions used to set up different aspects of the model
"""

# Built-in libaries
import glob
import warnings
from datetime import datetime

import numpy as np

# External libraries
import pandas as pd

from pygem.setup.config import ConfigManager

# instantiate ConfigManager
config_manager = ConfigManager()
# read the config
pygem_prms = config_manager.read_config()


[docs] def datesmodelrun( startyear=pygem_prms['climate']['ref_startyear'], endyear=pygem_prms['climate']['ref_endyear'], spinupyears=0, option_wateryear=pygem_prms['climate']['ref_wateryear'], ): """ Create table of year, month, day, water year, season and number of days in the month. Parameters ---------- startyear : int starting year endyear : int ending year spinupyears : int number of spinup years Returns ------- dates_table : pd.DataFrame table where each row is a timestep and each column is attributes (date, year, wateryear, etc.) of that timestep """ # Include spinup time in start year startyear_wspinup = startyear - spinupyears # Convert start year into date depending on option_wateryear if option_wateryear == 'hydro': startdate = str(startyear_wspinup - 1) + '-10-01' enddate = str(endyear) + '-09-30' elif option_wateryear == 'calendar': startdate = str(startyear_wspinup) + '-01-01' enddate = str(endyear) + '-12-31' elif option_wateryear == 'custom': startdate = str(startyear_wspinup) + '-' + pygem_prms['time']['startmonthday'] enddate = str(endyear) + '-' + pygem_prms['time']['endmonthday'] else: assert True == False, '\n\nError: Select an option_wateryear that exists.\n' # Convert input format into proper datetime format startdate = datetime(*[int(item) for item in startdate.split('-')]) enddate = datetime(*[int(item) for item in enddate.split('-')]) if pygem_prms['time']['timestep'] == 'monthly': startdate = startdate.strftime('%Y-%m') enddate = enddate.strftime('%Y-%m') elif pygem_prms['time']['timestep'] == 'daily': startdate = startdate.strftime('%Y-%m-%d') enddate = enddate.strftime('%Y-%m-%d') # Generate dates_table using date_range function if pygem_prms['time']['timestep'] == 'monthly': # Automatically generate dates from start date to end data using a monthly frequency (MS), which generates # monthly data using the 1st of each month' dates_table = pd.DataFrame( {'date': pd.date_range(startdate, enddate, freq='MS', unit='s')} ) # Select attributes of DateTimeIndex (dt.year, dt.month, and dt.daysinmonth) dates_table['year'] = dates_table['date'].dt.year dates_table['month'] = dates_table['date'].dt.month dates_table['daysinmonth'] = dates_table['date'].dt.daysinmonth dates_table['timestep'] = np.arange(len(dates_table['date'])) # Set date as index dates_table.set_index('timestep', inplace=True) # Remove leap year days if user selected this with option_leapyear if pygem_prms['time']['option_leapyear'] == 0: mask1 = dates_table['daysinmonth'] == 29 dates_table.loc[mask1, 'daysinmonth'] = 28 elif pygem_prms['time']['timestep'] == 'daily': # Automatically generate daily (freq = 'D') dates dates_table = pd.DataFrame( {'date': pd.date_range(startdate, enddate, freq='D')} ) # Extract attributes for dates_table dates_table['year'] = dates_table['date'].dt.year dates_table['month'] = dates_table['date'].dt.month dates_table['day'] = dates_table['date'].dt.day dates_table['daysinmonth'] = dates_table['date'].dt.daysinmonth # Set date as index dates_table.set_index('date', inplace=True) # Remove leap year days if user selected this with option_leapyear if pygem_prms['time']['option_leapyear'] == 0: # First, change 'daysinmonth' number mask1 = dates_table['daysinmonth'] == 29 dates_table.loc[mask1, 'daysinmonth'] = 28 # Next, remove the 29th days from the dates mask2 = (dates_table['month'] == 2) & (dates_table['day'] == 29) dates_table.drop(dates_table[mask2].index, inplace=True) else: print( "\n\nError: Please select 'daily' or 'monthly' for gcm_timestep. Exiting model run now.\n" ) exit() # Add column for water year # Water year for northern hemisphere using USGS definition (October 1 - September 30th), # e.g., water year for 2000 is from October 1, 1999 - September 30, 2000 dates_table['wateryear'] = dates_table['year'] for step in range(dates_table.shape[0]): if dates_table.loc[step, 'month'] >= 10: dates_table.loc[step, 'wateryear'] = dates_table.loc[step, 'year'] + 1 # Add column for seasons # create a season dictionary to assist groupby functions seasondict = {} month_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12] season_list = [] for i in range(len(month_list)): if ( month_list[i] >= pygem_prms['time']['summer_month_start'] and month_list[i] < pygem_prms['time']['winter_month_start'] ): season_list.append('summer') seasondict[month_list[i]] = season_list[i] else: season_list.append('winter') seasondict[month_list[i]] = season_list[i] dates_table['season'] = dates_table['month'].apply(lambda x: seasondict[x]) return dates_table
[docs] def daysinmonth(year, month): """ Return days in month based on the month and year Parameters ---------- year : str month : str Returns ------- integer of the days in the month """ if year % 4 == 0: daysinmonth_dict = { 1: 31, 2: 29, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31, } else: daysinmonth_dict = { 1: 31, 2: 28, 3: 31, 4: 30, 5: 31, 6: 30, 7: 31, 8: 31, 9: 30, 10: 31, 11: 30, 12: 31, } return daysinmonth_dict[month]
[docs] def hypsometrystats(hyps_table, thickness_table): """Calculate the volume and mean associated with the hypsometry data. Output is a series of the glacier volume [km**3] and mean elevation values [m a.s.l.]. """ # Glacier volume [km**3] glac_volume = (hyps_table * thickness_table / 1000).sum(axis=1).values # Mean glacier elevation glac_hyps_mean = np.zeros(glac_volume.shape) glac_hyps_mean[glac_volume > 0] = ( hyps_table[glac_volume > 0].values * hyps_table[glac_volume > 0].columns.values.astype(int) ).sum(axis=1) / hyps_table[glac_volume > 0].values.sum(axis=1) # Median computations # main_glac_hyps_cumsum = np.cumsum(hyps_table, axis=1) # for glac in range(hyps_table.shape[0]): # # Median glacier elevation # # Computed as the elevation when the normalized cumulative sum of the glacier area exceeds 0.5 (50%) # series_glac_hyps_cumsumnorm = main_glac_hyps_cumsum.loc[glac,:].copy() / glac_area.iloc[glac] # series_glac_hyps_cumsumnorm_positions = (np.where(series_glac_hyps_cumsumnorm > 0.5))[0] # glac_hyps_median = main_glac_hyps.columns.values[series_glac_hyps_cumsumnorm_positions[0]] # NOTE THERE IS A 20 m (+/- 5 m) OFFSET BETWEEN THE 10 m PRODUCT FROM HUSS AND THE RGI INVENTORY """ return glac_volume, glac_hyps_mean
[docs] def import_Husstable( rgi_table, filepath, filedict, drop_col_names, indexname=pygem_prms['rgi']['indexname'], option_shift_elevbins_20m=True, ): """Use the dictionary specified by the user to extract the desired variable. The files must be in the proper units (ice thickness [m], area [km2], width [km]) and should be pre-processed. Output is a Pandas DataFrame of the variable for all the glaciers in the model run (rows = GlacNo, columns = elevation bins). Line Profiling: Loading in the table takes the most time (~2.3 s) """ rgi_regionsO1 = sorted(list(rgi_table.O1Region.unique())) glac_no = [x.split('-')[1] for x in rgi_table.RGIId.values] glac_no_byregion = {} for region in rgi_regionsO1: glac_no_byregion[region] = [] for i in glac_no: region = int(i.split('.')[0]) glac_no_only = i.split('.')[1] glac_no_byregion[int(region)].append(glac_no_only) # Load data for each region for count, region in enumerate(rgi_regionsO1): # Select regional data for indexing glac_no = sorted(glac_no_byregion[region]) rgi_table_region = rgi_table.iloc[ np.where(rgi_table.O1Region.values == region)[0] ] # Load table ds = pd.read_csv(filepath + filedict[region]) # Select glaciers based on 01Index value from main_glac_rgi table # as long as Huss tables have all rows associated with rgi attribute table, then this shortcut works glac_table = ds.iloc[rgi_table_region['O1Index'].values] # Merge multiple regions if count == 0: glac_table_all = glac_table else: # If more columns in region, then need to expand existing dataset if glac_table.shape[1] > glac_table_all.shape[1]: all_col = list(glac_table_all.columns.values) reg_col = list(glac_table.columns.values) new_cols = [item for item in reg_col if item not in all_col] for new_col in new_cols: glac_table_all[new_col] = 0 elif glac_table.shape[1] < glac_table_all.shape[1]: all_col = list(glac_table_all.columns.values) reg_col = list(glac_table.columns.values) new_cols = [item for item in all_col if item not in reg_col] for new_col in new_cols: glac_table[new_col] = 0 glac_table_all = glac_table_all.append(glac_table) # Clean up table and re-index (make copy to avoid SettingWithCopyWarning) glac_table_copy = glac_table_all.copy() glac_table_copy.reset_index(drop=True, inplace=True) glac_table_copy.index.name = indexname # drop columns that are not elevation bins glac_table_copy.drop(drop_col_names, axis=1, inplace=True) # change NAN from -99 to 0 glac_table_copy[glac_table_copy == -99] = 0.0 # Shift Huss bins by 20 m since the elevation bins appear to be 20 m higher than they should be if option_shift_elevbins_20m: colnames = glac_table_copy.columns.tolist()[:-2] glac_table_copy = glac_table_copy.iloc[:, 2:] glac_table_copy.columns = colnames return glac_table_copy
# def selectcalibrationdata(main_glac_rgi): # """ # Select geodetic mass balance of all glaciers in the model run that have a geodetic mass balance. The geodetic mass # balances are stored in a csv file. # """ # # Import .csv file # rgi_region = int(main_glac_rgi.loc[main_glac_rgi.index.values[0],'RGIId'].split('-')[1].split('.')[0]) # ds = pd.read_csv(pygem_prms.cal_mb_filepath + pygem_prms.cal_mb_filedict[rgi_region]) # main_glac_calmassbal = np.zeros((main_glac_rgi.shape[0],4)) # ds[pygem_prms['rgi']['rgi_O1Id_colname']] = ((ds[pygem_prms.cal_rgi_colname] % 1) * 10**5).round(0).astype(int) # ds_subset = ds[[pygem_prms['rgi']['rgi_O1Id_colname'], pygem_prms.massbal_colname, pygem_prms.massbal_uncertainty_colname, # pygem_prms.massbal_time1, pygem_prms.massbal_time2]].values # rgi_O1Id = main_glac_rgi[pygem_prms['rgi']['rgi_O1Id_colname']].values # for glac in range(rgi_O1Id.shape[0]): # try: # # Grab the mass balance based on the RGIId Order 1 glacier number # main_glac_calmassbal[glac,:] = ds_subset[np.where(np.in1d(ds_subset[:,0],rgi_O1Id[glac])==True)[0][0],1:] # # np.in1d searches if there is a match in the first array with the second array provided and returns an # # array with same length as first array and True/False values. np.where then used to identify the index # # where there is a match, which is then used to select the massbalance value # # Use of numpy arrays for indexing and this matching approach is much faster than looping through; however, # # need the for loop because np.in1d does not order the values that match; hence, need to do it 1 at a time # except: # # If there is no mass balance data available for the glacier, then set as NaN # main_glac_calmassbal[glac,:] = np.empty(4) # main_glac_calmassbal[glac,:] = np.nan # main_glac_calmassbal = pd.DataFrame(main_glac_calmassbal, # columns=[pygem_prms.massbal_colname, pygem_prms.massbal_uncertainty_colname, # pygem_prms.massbal_time1, pygem_prms.massbal_time2]) # return main_glac_calmassbal
[docs] def selectglaciersrgitable( glac_no=None, rgi_regionsO1=None, rgi_regionsO2='all', rgi_glac_number='all', rgi_fp=pygem_prms['root'] + pygem_prms['rgi']['rgi_relpath'], rgi_cols_drop=pygem_prms['rgi']['rgi_cols_drop'], rgi_O1Id_colname=pygem_prms['rgi']['rgi_O1Id_colname'], rgi_glacno_float_colname=pygem_prms['rgi']['rgi_glacno_float_colname'], indexname=pygem_prms['rgi']['indexname'], include_landterm=True, include_laketerm=True, include_tidewater=True, glac_no_skip=pygem_prms['setup']['glac_no_skip'], min_glac_area_km2=0, debug=False, ): """ Select all glaciers to be used in the model run according to the regions and glacier numbers defined by the RGI glacier inventory. This function returns the rgi table associated with all of these glaciers. glac_no : list of strings list of strings of RGI glacier numbers (e.g., ['1.00001', '13.00001']) rgi_regionsO1 : list of integers list of integers of RGI order 1 regions (e.g., [1, 13]) rgi_regionsO2 : list of integers or 'all' list of integers of RGI order 2 regions or simply 'all' for all the order 2 regions rgi_glac_number : list of strings list of RGI glacier numbers without the region (e.g., ['00001', '00002']) Output: Pandas DataFrame of the glacier statistics for each glacier in the model run (rows = GlacNo, columns = glacier statistics) """ doc_url = 'https://pygem.readthedocs.io/en/latest/model_inputs.html#model-inputs' doc_meessage = ( 'Has the full dataset been downloaded? See documentation for more information:' ) if glac_no is not None: glac_no_byregion = {} rgi_regionsO1 = [int(i.split('.')[0]) for i in glac_no] rgi_regionsO1 = list(set(rgi_regionsO1)) for region in rgi_regionsO1: glac_no_byregion[region] = [] for i in glac_no: region = i.split('.')[0] glac_no_only = i.split('.')[1] glac_no_byregion[int(region)].append(glac_no_only) for region in rgi_regionsO1: glac_no_byregion[region] = sorted(glac_no_byregion[region]) # Create an empty dataframe rgi_regionsO1 = sorted(rgi_regionsO1) glacier_table = pd.DataFrame() for region in rgi_regionsO1: if glac_no is not None: rgi_glac_number = glac_no_byregion[region] fp = glob.glob(f'{rgi_fp}/{str(region).zfill(2)}*.csv') if len(fp) != 1: raise FileNotFoundError( f'No RGI file found for region {region} in {rgi_fp}. {doc_meessage} <a href="{doc_url}">{doc_url}</a>' ) fp = fp[0] try: csv_regionO1 = pd.read_csv(fp) except: csv_regionO1 = pd.read_csv(fp, encoding='latin1') # warn if RGI table only has one glacier - maybe the full dataset wasn't downloaded after initialization if rgi_glac_number == 'all' and len(csv_regionO1) == 1: warnings.warn( f'Only one glacier in region {region}.\n{doc_meessage} {doc_url}', UserWarning, stacklevel=2, ) # Populate glacer_table with the glaciers of interest if rgi_regionsO2 == 'all' and rgi_glac_number == 'all': if debug: print( 'All glaciers within region(s) %s are included in this model run.' % (region) ) if glacier_table.empty: glacier_table = csv_regionO1 else: glacier_table = pd.concat([glacier_table, csv_regionO1], axis=0) elif rgi_regionsO2 != 'all' and rgi_glac_number == 'all': if debug: print( 'All glaciers within subregion(s) %s in region %s are included in this model run.' % (rgi_regionsO2, region) ) for regionO2 in rgi_regionsO2: if glacier_table.empty: glacier_table = csv_regionO1.loc[ csv_regionO1['O2Region'] == regionO2 ] else: glacier_table = pd.concat( [ glacier_table, csv_regionO1.loc[csv_regionO1['O2Region'] == regionO2], ], axis=0, ) else: if len(rgi_glac_number) < 20: print( '%s glaciers in region %s are included in this model run: %s' % (len(rgi_glac_number), region, rgi_glac_number) ) else: print( '%s glaciers in region %s are included in this model run: %s and more' % (len(rgi_glac_number), region, rgi_glac_number[0:50]) ) rgiid_subset = [ 'RGI60-' + str(region).zfill(2) + '.' + x for x in rgi_glac_number ] rgiid_all = list(csv_regionO1.RGIId.values) rgi_idx = [rgiid_all.index(x) for x in rgiid_subset if x in rgiid_all] if glacier_table.empty: glacier_table = csv_regionO1.loc[rgi_idx] else: glacier_table = pd.concat( [glacier_table, csv_regionO1.loc[rgi_idx]], axis=0 ) glacier_table = glacier_table.copy() # reset the index so that it is in sequential order (0, 1, 2, etc.) glacier_table.reset_index(inplace=True) # drop connectivity 2 for Greenland and Antarctica glacier_table = glacier_table.loc[glacier_table['Connect'].isin([0, 1])] glacier_table.reset_index(drop=True, inplace=True) # change old index to 'O1Index' to be easier to recall what it is glacier_table.rename(columns={'index': 'O1Index'}, inplace=True) # Record the reference date glacier_table['RefDate'] = glacier_table['BgnDate'] # if there is an end date, then roughly average the year enddate_idx = glacier_table.loc[ (glacier_table['EndDate'] > 0), 'EndDate' ].index.values glacier_table.loc[enddate_idx, 'RefDate'] = ( np.mean( ( glacier_table.loc[enddate_idx, ['BgnDate', 'EndDate']].values / 10**4 ).astype(int), axis=1, ).astype(int) * 10**4 + 9999 ) # drop columns of data that is not being used glacier_table.drop(rgi_cols_drop, axis=1, inplace=True) # add column with the O1 glacier numbers glacier_table[rgi_O1Id_colname] = ( glacier_table['RGIId'].str.split('.').apply(pd.Series).loc[:, 1].astype(int) ) glacier_table['rgino_str'] = [x.split('-')[1] for x in glacier_table.RGIId.values] # glacier_table[rgi_glacno_float_colname] = (np.array([np.str.split(glacier_table['RGIId'][x],'-')[1] # for x in range(glacier_table.shape[0])]).astype(float)) glacier_table[rgi_glacno_float_colname] = np.array( [x.split('-')[1] for x in glacier_table['RGIId']] # [np.str.split(glacier_table['RGIId'][x],'-')[1] # for x in range(glacier_table.shape[0])] ).astype(float) # set index name glacier_table.index.name = indexname # Longitude between 0-360deg (no negative) glacier_table['CenLon_360'] = glacier_table['CenLon'] glacier_table.loc[glacier_table['CenLon'] < 0, 'CenLon_360'] = ( 360 + glacier_table.loc[glacier_table['CenLon'] < 0, 'CenLon_360'] ) # Subset glaciers based on their terminus type termtype_values = [] if include_landterm: termtype_values.append(0) # assume dry calving, regenerated, and not assigned are land-terminating termtype_values.append(3) termtype_values.append(4) termtype_values.append(9) if include_tidewater: termtype_values.append(1) # assume shelf-terminating glaciers are tidewater termtype_values.append(5) if include_laketerm: termtype_values.append(2) glacier_table = glacier_table.loc[glacier_table['TermType'].isin(termtype_values)] glacier_table.reset_index(inplace=True, drop=True) # Glacier number with no trailing zeros glacier_table['glacno'] = [ str(int(x.split('-')[1].split('.')[0])) + '.' + x.split('-')[1].split('.')[1] for x in glacier_table.RGIId ] # Remove glaciers below threshold glacier_table = glacier_table.loc[glacier_table['Area'] > min_glac_area_km2, :] glacier_table.reset_index(inplace=True, drop=True) # Remove glaciers that are meant to be skipped if glac_no_skip is not None: glac_no_all = list(glacier_table['glacno']) glac_no_unique = [x for x in glac_no_all if x not in glac_no_skip] unique_idx = [glac_no_all.index(x) for x in glac_no_unique] glacier_table = glacier_table.loc[unique_idx, :] glacier_table.reset_index(inplace=True, drop=True) print( 'This study is focusing on %s glaciers in region %s' % (glacier_table.shape[0], rgi_regionsO1) ) return glacier_table
# OPTION 2: CUSTOMIZE REGIONS USING A SHAPEFILE that specifies the # various regions according to the RGI IDs, i.e., add an # additional column to the RGI table. # [INSERT CODE FOR IMPORTING A SHAPEFILE] # (1) import shapefile with custom boundaries, (2) grab the RGIIDs # of glaciers that are in these boundaries, (3) perform calibration # using these alternative boundaries that may (or may not) be more # representative of regional processes/climate # Note: this is really only important for calibration purposes and # post-processing when you want to show results over specific # regions. # Development Note: if create another method for selecting glaciers, # make sure that update way to select glacier # hypsometry as well.
[docs] def split_list(lst, n=1, option_ordered=1, group_thousands=False): """ Split list into batches for the supercomputer. Parameters ---------- lst : list List that you want to split into separate batches n : int Number of batches to split glaciers into. Returns ------- lst_batches : list list of n lists that have sequential values in each list """ # If batches is more than list, then there will be one glacier in each batch if option_ordered == 1: if n > len(lst): n = len(lst) n_perlist_low = int(len(lst) / n) n_perlist_high = int(np.ceil(len(lst) / n)) lst_copy = lst.copy() count = 0 lst_batches = [] for x in np.arange(n): count += 1 if count <= len(lst) % n: lst_subset = lst_copy[0:n_perlist_high] lst_batches.append(lst_subset) [lst_copy.remove(i) for i in lst_subset] else: lst_subset = lst_copy[0:n_perlist_low] lst_batches.append(lst_subset) [lst_copy.remove(i) for i in lst_subset] else: if n > len(lst): n = len(lst) lst_batches = [[] for x in np.arange(n)] nbatch = 0 for count, x in enumerate(lst): if count % n == 0: nbatch = 0 lst_batches[nbatch].append(x) nbatch += 1 if group_thousands: # get unique sets of thousand glaciers (ie. RGIXX.YY) # this may be preferrable when running script that download glacier directories from oggm # otherwise if two batches contain glacier ids from the same set of thousand glaciers, # two downloads from oggm may run simultaneously and cause conflicts sets = [x[:5] for x in lst for lst in lst_batches] sets = list(set(sets)) lst_batches_th = [] # keep the number of batches, but move items around to not have sets of RGIXX.YY ids in more than one batch for s in sets: merged = [ item for sublist in lst_batches for item in sublist if item[:5] == s ] lst_batches_th.append(merged) # ensure that number of batches doesn't exceed original number while len(lst_batches_th) > n: # move shortest batch to next shortest batch lengths = np.asarray([len(batch) for batch in lst_batches_th]) sorted = lengths.argsort() idx0 = sorted[0] idx1 = sorted[1] lst_batches_th[idx1].extend(lst_batches_th[idx0]) del lst_batches_th[idx0] lst_batches = lst_batches_th return lst_batches