# data_generators.py

from netCDF4 import Dataset, num2date
import numpy as np
import tensorflow as tf

import glob
import itertools as it
import json
import os
import pickle

import utility

import xarray as xr


"""
    #TODO: Simplify the code so that it does not depend on Tensorflow. Convert t_params and m_params into kwargs.

    Below is an example of how to use our Tensorflow-based pipeline for extracting weather (model field) and rain data.
    The user can specify options such as the locations, the start date, and whether data is loaded in batches or all at once.
    The generator aligns the rain and model field data, masks out invalid rain readings and normalizes the model fields.
    
    import data_generators
    import utility
    import hparameters
    import pandas as pd
    import numpy as np
    
    # Attributes to be Changed By User
    li_locations = ['London']

    start_date = "2006-03-10" #YYYY-MM-DD           #Start date for data you want to sample
    end_date =  "2009-08-02"                        #End date for sampled data

    batch_size = 4                                 # To prevent any memory issues
    window_shift = 28                               # E.g. in order to load data as batches of 28 days of data

    data_dir = "./Data/Rain_Data_Nov19"
    rain_fn = "Rain_Data/rr_ens_mean_0.1deg_reg_v20.0e_197901-201907_djf_uk.nc"
    mf_fn = "model_fields_linearly_interpolated_1979-2019.nc"

    # Fixed ATTRIBUTES -- Do Not Change
    ## File paths to data
    ## Model fields to extract from model field dataset 
    vars_for_feature = ['unknown_local_param_137_128', 'unknown_local_param_133_128', 'air_temperature', 'geopotential', 'x_wind', 'y_wind' ]
    normalization_scales = {        "rain":4.69872+0.5,
                                    "model_fields": np.array([6.805,
                                                            0.001786,
                                                              5.458,
                                                              1678.2178,
                                                                5.107268,
                                                                4.764533]) 
                                                #- unknown_local_param_137_128
                                                # - unknown_local_param_133_128,  
                                                # # - air_temperature, 
                                                # # - geopotential
                                                # - x_wind, 
                                                # # - y_wind
        }
    normalization_shift = {
                                "rain":2.844,
                                "model_fields": np.array([15.442,
                                                            0.003758,
                                                            274.833,
                                                            54309.66,
                                                            3.08158,
                                                            0.54810]) }
    mask_fill_value = {
                                    "rain":0.0,
                                    "model_field":0.0 
        }

    ## Locations to extract
    region_grid_params = {
            'outer_box_dims':[16,16],
            'inner_box_dims':[4,4],
            'vertical_shift':4,
            'horizontal_shift':4,
            'input_image_shape':[100,140]}
    vertical_slides = (region_grid_params['input_image_shape'][0] - region_grid_params['outer_box_dims'][0] +1 )// region_grid_params['vertical_shift']
    horizontal_slides = (region_grid_params['input_image_shape'][1] - region_grid_params['outer_box_dims'][1] +1 ) // region_grid_params['horizontal_shift']
    region_grid_params['slides_v_h'] = [vertical_slides, horizontal_slides]

    ## Times 
    target_start_date = np.datetime64('1950-01-01') + np.timedelta64(10592,'D') #E-obs records start from 1950 
    feature_start_date = np.datetime64('1970-01-01') + np.timedelta64(78888, 'h') #ERA5 records start from 1979
        
    date_tss = pd.date_range( end=end_date, start=start_date, freq='D', normalize=True) #Timestamps for target rain
    timestamps = list ( (date_tss - pd.Timestamp("1970-01-01") ) // pd.Timedelta('1s') )
        
    ## Data Format
    all_at_once = True                              # False would make dataset come out in batches
    batches = ( (np.datetime64(end_date) - np.datetime64(start_date)).astype(int) // window_shift ) // batch_size

    ## Param Dictionaries
    t_params = { 
                'data_dir': data_dir,
                'rain_fn':rain_fn,
                'mf_fn':mf_fn,
                
                'batch_size':1,
                'time_sequential':True,
                
                'li_locations':li_locations,

                'start_date':start_date,
                'parallel_calls':-1, 
                'batches':batches,

                'normalization_shift':normalization_shift,
                'normalization_scales':normalization_scales,
                'mask_fill_value':mask_fill_value}
    
    m_params = {
        'time_sequential':True,
        'region_grid_params':region_grid_params}

    # Instantiate the dataloader
    era5_eobs = data_generators.Era5_Eobs( t_params, m_params)
    
    # Sets up the internal dataloader parameters used to extract data for specific location(s)
    era5_eobs.location_size_calc(t_params.get( 'li_locations', ['London'] ) )
    
    # load_data_era5eobs returns both the dataset and the index of the central position
    era5_eobs_gen, idx_of_central_pos = era5_eobs.load_data_era5eobs(  
         t_params['batches'],
         t_params['start_date'],
         drop_remainder=False )
    
    era5_eobs_gen = era5_eobs_gen.take(t_params['batches'])
        # format of each dataset element: ( model_field, rain_data, mask )
                                                # model_field : T x H x W x 4*6 #0th dim represents days. #3rd dim represents 6 features and four quarday-periods periods in a day
                                                # rain_data : T x H x W # -> 0th dimension represents days
                                                # idx_of_central_position:  2

    
    #OPTION 1: Extract Time Slices of data 
    for datum in iter(era5_eobs_gen):
        output = f(datum)
    
    # OPTION 2: Load data all at once
    all_data = list(iter(era5_eobs_gen))
    model_fields, rain_data, mask = zip(*all_data)
    

"""

# region -- Era5_Eobs
class Generator():
    """
        Base class for Generator classes
        Example of how to use:
            fn = "Data/Rain_Data/rr_ens_mean_0.1deg_reg_v20.0e_197901-201907_djf_uk.nc"
            rain_gen = Generator_rain(fp=fn, all_at_once=True)
            datum = next(rain_gen())
    """
    
    def __init__(self, fp, all_at_once=False):
        """Set up shared state for the rain / model-field generators.

        Stores the file path, the city -> [latitude, longitude] lookup table,
        the latitude/longitude axes of the grid, and the length of the
        ``time`` dimension of the netCDF file at ``fp``.

        Args:
            fp (str): Filepath of netCDF4 file containing data.
            all_at_once (bool, optional): Whether ``__call__`` should use
                ``yield_all`` (True) or ``yield_iter`` (False). Defaults to False.
        """
        self.generator = None
        self.all_at_once = all_at_once
        self.fp = fp
        # [latitude, longitude] per location; some entries carry small manual offsets
        self.city_latlon = {
            "London": [51.5074, -0.1278],
            "Cardiff": [51.4816 + 0.15, -3.1791 -0.05], #1st Rainiest
            "Glasgow": [55.8642,  -4.2518], #3rd rainiest
            "Lancaster":[54.466, -2.8007], #2nd highest
            "Bradford": [53.7960, -1.7594], #3rd highest
            "Manchester":[53.4808, -2.2426], #15th rainiest
            "Birmingham":[52.4862, -1.8904], #25th
            "Liverpool":[53.4084 , -2.9916 +0.1 ], #18th rainiest
            "Leeds":[ 53.8008, -1.5491 ], #8th
            "Edinburgh": [55.9533, -3.1883],
            "Belfast": [54.5973, -5.9301], #25
            "Dublin": [53.3498, -6.2603],
            "LakeDistrict":[54.4500,-3.100],
            "Newry":[54.1751, -6.3402],
            "Preston":[53.7632, -2.7031 ],
            "Truro":[50.2632, -5.0510],
            "Bangor":[54.2274 - 0, -4.1293 - 0.3],
            "Plymouth":[50.3755 + 0.1, -4.1427],
            "Norwich": [52.6309, 1.2974],
            "StDavids":[51.8812+0.05, -5.2660+0.05] ,
            "Swansea":[51.6214+0.05,-3.9436],
            "Lisburn":[54.5162,-6.058],
            "Salford":[53.4875, -2.2901],
            "Aberdeen":[57.1497,-2.0943-0.05],
            "Stirling":[56.1165, -3.9369],
            "Hull":[53.7676+0.05, 0.3274]
            }

        # The latitude / longitude axes of the 0.1 degree grid (100 x 140 points).
        # Latitude is stored in descending order (north to south).
        self.latitude_array = np.linspace(58.95, 49.05, 100)
        self.longitude_array = np.linspace(-10.95, 2.95, 140)

        # Retrieve the temporal length of the dataset. The file is only read
        # here, so open it read-only: "r+" needs write permission and would
        # fail on read-only data directories.
        with Dataset(self.fp, "r", format="NETCDF4") as ds:
            self.data_len = ds.dimensions['time'].size
                
    def yield_all(self):
        """Placeholder for the "load everything at once" path.

        The base class does nothing here and returns None; subclasses
        provide the real implementation.
        """
        return None

    def yield_iter(self):
        """Placeholder for the chunk-by-chunk path.

        The base class does nothing here and returns None; subclasses
        provide the real implementation.
        """
        return None
    
    def __call__(self, ):
        """Return ``yield_all()`` when ``all_at_once`` is truthy, else ``yield_iter()``."""
        return self.yield_all() if self.all_at_once else self.yield_iter()

    
    def find_idxs_of_loc(self, loc="London"):
        """Returns the grid indexes on the 2D map of the UK which correspond to the location (loc) point

        Args:
            loc (str, optional): name of the location. Defaults to "London".

        Returns:
            tuple: Contains indexes (h1,w1) for the location (loc)
        """        
        coordinates = self.city_latlon[loc]
        indexes = self.find_nearest_latitude_longitude( coordinates)  # (1,1)
        return indexes

    def find_idx_of_loc_region(self, loc, region_grid_params):
        """ Returns the the indexes defining gridded box that surrounds the location of interests

            Raises:
                ValueError: [If the location of interest is too close to the border for evaluation]

            Returns:
                tuple: Returns a tuple ( [upper_h, lower_h], [left_w, right_w] ), defining the grid box that 
                    surrounds the location (loc)
        """
        
        city_idxs = self.find_idxs_of_loc(loc) #[h,w]
        
        # Checking that central region of interest is not too close to the border
        bool_regioncheck1 = np.array(city_idxs) <  np.array(region_grid_params['outer_box_dims']) - np.array(region_grid_params['inner_box_dims'])
        bool_regioncheck2 = np.array(region_grid_params['input_image_shape']) - np.array(city_idxs) < np.array(region_grid_params['inner_box_dims'])
        if bool_regioncheck1.any() or bool_regioncheck2.any(): raise ValueError("The specified region is too close to the border")

        # Defining the span, in all directions, from the central region
        if( region_grid_params['outer_box_dims'][0]%2 == 0 ):
            h_up_span = region_grid_params['outer_box_dims'][0]//2 
            h_down_span = h_up_span
        else:
            h_up_span = region_grid_params['outer_box_dims'][0]//2
            h_down_span = region_grid_params['outer_box_dims'][0]//2 + 1

        if( region_grid_params['outer_box_dims'][1]%2 == 0 ):
            w_left_span = region_grid_params['outer_box_dims'][1]//2 
            w_right_span = w_left_span
        else:
            w_left_span = region_grid_params['outer_box_dims'][1]//2
            w_right_span = region_grid_params['outer_box_dims'][1]//2 + 1
        
        #Defining outer_boundaries
        upper_h = city_idxs[0] - h_up_span
        lower_h = city_idxs[0] + h_down_span

        left_w = city_idxs[1] - w_left_span
        right_w = city_idxs[1] + w_right_span
        
        return ( [upper_h, lower_h], [left_w, right_w] )

    def find_nearest_latitude_longitude(self, lat_lon):
        """Given specific lat_lon, this method finds the closest long/lat points on the
            0.1degree grid our input/target data is defined on

        Args:
            lat_lon (tuple): tuple containing the lat and lon values of interest

        Returns:
            tuple: tuple containing the idx_h and idx_w values that detail the posiiton on lat_lon on the 
                0.1degree grid on which the ERA5 and E-Obvs data is defined
        """        
        latitude_index =    np.abs(self.latitude_array - lat_lon[0] ).argmin()
        longitude_index =   np.abs(self.longitude_array - lat_lon[1]).argmin()

        return (latitude_index, longitude_index)
    
    def get_locs_for_whole_map(self, region_grid_params):
        """This function returns a list of boundaries which can be used to extract all patches
            from the 2D map

            Args:
                region_grid_params (dictionary): a dictioary containing information on the sizes of 
                    patches to be extract from the main image

            Returns:
                list : return a list of of tuples defining the boundaries of the region
                        of the form [ ([upper_h, lower_h]. [left_w, right_w]), ... ]
        """       
        input_image_shape = region_grid_params['input_image_shape']
        h_shift = region_grid_params['vertical_shift']
        w_shift = region_grid_params['horizontal_shift']
        h_span, w_span = region_grid_params['outer_box_dims']

        #list of values for upper_h and lower_h
        range_h = np.arange(0, input_image_shape[0]-h_span+1, step=h_shift, dtype=np.int32 ) 
        # list of pairs of values (upper_h, lower_h)
        li_range_h_pairs = [ [range_h[i], range_h[i]+h_span ] for i in range(0,len(range_h))]
        
        #list of values for left_w and right_w
        range_w = np.arange(0, input_image_shape[1]-w_span+1, step=w_shift, dtype=np.int32)
        # list of pairs of values (left_w, right_w)
        li_range_w_pairs = [ [range_w[i], range_w[i]+w_span ] for i in range(0,len(range_w))]

        li_boundaries = list( it.product( li_range_h_pairs, li_range_w_pairs ) )

        # A list of points on grid to remove representing non-land (water) surface on a UK part
        boundaries_to_remove = [ ([0,16],[0,16]), ([0,16],[4,20]), ([0,16],[8,24]), ([0,16],[12,28]), ([0,16],[16,32]), ([0,16],[20,36]), ([0,16],[24,40]), ([0,16],[28,44]),  ([0,16],[32,48]),                           ([0,16],[64,80]),([0,16],[68,84]),([0,16],[72,88]), ([0,16],[76,92]), ([0,16],[80,96]), ([0,16],[84,100]), ([0,16],[88,104]),([0,16],[92,108]),([0,16],[96,112]), ([0,16],[100,116]),([0,16],[104,120]),([0,16],[108,124]), ([0,16],[112,128]), ([0,16],[116,132]), ([0,16],[120,136]), ([0,16],[124,140]),
                                ([4,20],[0,16]), ([4,20],[4,20]), ([4,20],[8,24]), ([4,20],[12,28]), ([4,20],[16,32]), ([4,20],[20,36]), ([4,20],[24,40]), ([4,20],[28,44]),  ([4,20],[32,48]),                             ([4,20],[76,92]), ([4,20],[80,96]), ([4,20],[84,100]),([4,20],[88,104]),([4,20],[92,108]),([4,20],[96,112]),([4,20],[100,116]), ([4,20],[104,120]),([4,20],[108,124]),([4,20],[112,128]), ([4,20],[116,132]), ([4,20],[120,136]), ([4,20],[124,140]),
                                   ([8,24],[0,16]), ([8,24],[4,20]), ([8,24],[8,24]), ([8,24],[12,28]),  ([8,24],[16,32]), ([8,24],[20,36]), ([8,24],[24,40]), ([8,24],[28,44]),  ([8,24],[32,48]),                                 ([8,24],[96,112]),([8,24],[100,116]),([8,24],[104,120]),([8,24],[108,124]),([8,24],[112,128]), ([8,24],[116,132]), ([8,24],[120,136]), ([8,24],[124,140]),
                                ([12,28],[0,16]), ([12,28],[4,20]), ([12,28],[8,24]), ([12,28],[12,28]),  ([12,28],[16,32]), ([12,28],[20,36]), ([12,28],[24,40]), ([12,28],[28,44]),  ([12,28],[32,48]),                           ([12,28],[96,112]),([12,28],[100,116]),([12,28],[104,120]),([12,28],[108,124]),([12,28],[112,128]),([12,28],[116,132]), ([12,28],[120,136]), ([12,28],[124,140]),
                                   ([16,32],[0,16]), ([16,32],[4,20]), ([16,32],[8,24]), ([16,32],[12,28]),([16,32],[16,32]), ([16,32],[20,36]), ([16,32],[24,40]), ([16,32],[28,44]),  ([16,32],[32,48]),                           ([16,32],[96,112]),([16,32],[100,116]),([16,32],[104,120]),([16,32],[108,124]),([16,32],[112,128]), ([16,32],[120,136]), ([16,32],[124,140]),
                                   ([20,36],[0,16]), ([20,36],[4,20]), ([20,36],[8,24]), ([20,36],[20,36]),([20,36],[16,32]), ([20,36],[20,36]), ([20,36],[24,40]), ([20,36],[28,44]),  ([20,36],[32,48]),                           ([20,36],[96,112]),([20,36],[100,116]),([20,36],[104,120]),([20,36],[108,124]),([20,36],[112,128]),([20,36],[116,132]), ([20,36],[124,140]),
                                   ([24,40],[0,16]), ([24,40],[4,20]), ([24,40],[8,24]), ([24,40],[12,28]),([24,40],[16,32]), ([24,40],[20,36]), ([24,40],[24,40]), ([24,40],[28,44]),  ([24,40],[32,48]),                                  ([24,40],[100,116]),([24,40],[104,120]),([24,40],[108,124]),([24,40],[112,128]),([24,40],[116,132]), ([24,40],[120,136]) , ([24,40],[124,140]),
                                                                                                                                                                                                                                            ([28,44],[100,116]),([28,44],[104,120]),([28,44],[108,124]),([28,44],[112,128]),([28,44],[116,132]), ([28,44],[120,136]), ([28,44],[124,140]),                                                                                                                                                                                                                                                                                                            
                                                                                                                                                                                                                                                                ([32,48],[104,120]),([32,48],[108,124]),([32,48],[112,128]),([32,48],[116,132]), ([32,48],[120,136]),([32,48],[124,140]),                                   
                                                                                                                                                                                                                                                                                    ([36,52],[108,124]),([36,52],[112,128]),([36,52],[116,132]), ([36,52],[120,136]),([36,52],[124,140]),                                   
                                                                                                                                                                                                                                                                                                    ([40,56],[112,128]),([40,56],[116,132]),([40,56],[120,136]),([40,56],[124,140]),      
                                                                                                                                                                                                                                                                                                                                           ([44,60],[120,136]),([44,60],[124,140]),                                   
                                                                                                                                                                                                                                                                                                                                           ([48,64],[120,136]),([48,64],[124,140]),                                   
                                                                                                                                                                                                                                            ([20,36],[14,32]), 
                                                                                                                                                                                                                                            ([24,40],[14,32]), 
                                   ([80,96],[0,16]), ([80,96],[4,20]), ([80,96],[8,24]), ([80,96],[12,28]), ([80,96],[14,32]), ([80,96],[18,36]), ([80,96],[22,40]),
                                   ([84,100],[0,16]), ([84,100],[4,20]), ([84,100],[8,24]), ([84,100],[12,28]),([84,100],[14,32]), ([84,100],[18,36]), ([84,100],[22,40])  ]

        li_boundaries = [ x for x in li_boundaries if x not in boundaries_to_remove]
        return li_boundaries 

class Generator_rain(Generator):
    """ A generator for E-obs 0.1 degree rain data
    
    Returns:
        A python generator for the rain data
        
    """
    def __init__(self, **generator_params ):
        """Forward every keyword argument (e.g. ``fp``, ``all_at_once``) to the base ``Generator``."""
        super().__init__(**generator_params)
        
    def yield_all(self):
        """Yield the whole 'rr' variable at once as a single (values, valid_mask) pair.

        The second-to-last axis is reversed so that each 2-D map is flipped the
        same way as in ``yield_iter``. The original ``[::-1, :]`` was applied
        to the full array and therefore reversed axis 0 instead; ``yield_iter``
        iterates over axis 0 and flips each resulting 2-D chunk, so axis 0 is
        the record axis, not the one to flip.

        Yields:
            tuple: (data, mask) where ``mask`` is True for valid (unmasked) readings.
        """
        with Dataset(self.fp, "r", format="NETCDF4",keepweakref=True) as ds:
            _data = ds.variables['rr'][:]
            # getmaskarray always returns a full boolean array, even when
            # nothing is masked (getmask would return the scalar `nomask`).
            valid = np.logical_not( np.ma.getmaskarray(_data) )
            yield np.ma.getdata(_data)[..., ::-1, :], valid[..., ::-1, :]
            
    def yield_iter(self):
        """Yield the 'rr' variable one slice along its first axis at a time.

        The variable is read fully into memory and the file is closed before
        iteration starts, so no netCDF handle is left open for the lifetime of
        the generator (the original never closed it).

        Yields:
            tuple: (data, mask) for one slice, both flipped along their first
                axis; ``mask`` is True for valid (unmasked) readings.
        """
        with Dataset(self.fp, "r", format="NETCDF4", keepweakref=True) as ds:
            rain = ds.variables['rr'][:]

        for chunk in rain:
            data = np.ma.getdata(chunk)
            # getmaskarray always returns a full boolean array, even when
            # nothing is masked (getmask would return the scalar `nomask`).
            mask = np.logical_not( np.ma.getmaskarray(chunk) )
            yield data[ ::-1 , :], mask[::-1, :]


    def __call__(self):
        """Always return the chunked iterator, regardless of ``all_at_once``."""
        rain_stream = self.yield_iter()
        return rain_stream
    
class Generator_mf(Generator):
    """Creates a generator for the model_fields_dataset
    """

    def __init__(self, vars_for_feature, seq_len=100 ,**generator_params):
        """Configure the model-field generator.

        Args:
            vars_for_feature (list): names of the variables to read from the file,
                e.g. ['unknown_local_param_137_128', 'unknown_local_param_133_128',
                'air_temperature', 'geopotential', 'x_wind', 'y_wind' ]
            seq_len (int, optional): chunk-length factor; the number of time steps
                read per chunk by ``yield_iter`` is ``seq_len*25``, or 1400 when
                ``seq_len`` is falsy. Defaults to 100.
            generator_params : keyword params passed to the base Generator class
        """
        super().__init__(**generator_params)

        self.vars_for_feature = vars_for_feature

        if seq_len:
            self.seq_len = seq_len*25
        else:
            self.seq_len = 1400

        # Time-index window; start_idx is overwritten by the data loader
        self.start_idx = 0
        self.end_idx =0
        #self.ds = Dataset(self.fp, "r", format="NETCDF4")


    def yield_all(self):
        """Yield all requested model fields at once as one (data, valid_mask) pair.

        The time axis is sliced from ``start_idx`` to ``end_idx``. ``end_idx``
        starts at 0 and the data loader in this file only ever sets
        ``start_idx``, which previously produced an empty time slice; an unset
        (falsy) ``end_idx`` now means "read through to the end of the file".
        The dataset is opened in a context manager so it is closed once the
        arrays have been materialised.

        Yields:
            tuple: (stacked_data, stacked_masks), with the variables stacked on a
                new last axis in the order of ``vars_for_feature``;
                ``stacked_masks`` is True for valid values.
        """
        stop_idx = self.end_idx if self.end_idx else None
        slice_t = slice( self.start_idx , stop_idx )
        # Trim the border cells of the model-field grid (same crop as yield_iter
        # for a 103 x 144 grid)
        slice_h = slice(1,103-2 )
        slice_w = slice(2,144-2)

        with xr.open_dataset(self.fp, cache=False, decode_times=False, decode_cf=False) as xr_gn:
            subset = xr_gn.isel(time=slice_t, latitude=slice_h, longitude=slice_w)

            next_marray = [ subset[name].to_masked_array(copy=False) for name in self.vars_for_feature ]

            _data = [ np.ma.getdata(_mar) for _mar in next_marray ]
            # invert: True marks valid values
            _masks = [ np.logical_not(np.ma.getmask(_mar)) for _mar in next_marray ]

            # np.stack copies, so the results stay valid after the file is closed
            stacked_data = np.stack(_data, axis=-1)
            stacked_masks = np.stack(_masks, axis=-1)

        yield stacked_data, stacked_masks #(T, 100,140,6) 

    def yield_iter(self):
        """Yield the requested model fields in chunks of up to ``seq_len`` time steps.

        Reading starts at ``start_idx`` and continues to ``data_len``. The
        dataset is opened in a context manager so that it is closed when the
        generator is exhausted or closed (the original never closed it).

        Yields:
            tuple: (data, valid_mask) per chunk, with the variables stacked on a
                new last axis in the order of ``vars_for_feature`` and the grid
                border cropped; ``valid_mask`` is True for valid values.
        """
        with xr.open_dataset(self.fp, cache=False, decode_times=False, decode_cf=False) as xr_gn:

            idx = self.start_idx

            while idx < self.data_len:

                # the final chunk may be shorter than seq_len
                adj_seq_len = min(self.seq_len, self.data_len - idx )

                _slice = slice( idx , idx  + adj_seq_len)
                next_marray = [ xr_gn[name].isel(time=_slice).to_masked_array(copy=True) for name in self.vars_for_feature ]

                _data = [ np.ma.getdata(_mar) for _mar in next_marray ]
                # invert: True marks valid values
                _masks = [ np.logical_not(np.ma.getmask(_mar)) for _mar in next_marray ]

                stacked_data = np.stack(_data, axis=-1)
                stacked_masks = np.stack(_masks, axis=-1)

                idx += adj_seq_len

                # crop the grid border: drop 1 row at the top, 2 at the bottom, 2 columns each side
                yield stacked_data[ :, 1:-2, 2:-2, :], stacked_masks[ :, 1:-2 , 2:-2, :] #(T, 100,140,6) 

class Era5_Eobs():

    """Produces Tensorflow Datasets for the ERA5 and E-obs dataset
    
    """
    def __init__(self, t_params, m_params): 
        """Build the rain and model-field generators and resolve the locations to use.

        Args:
            t_params (dict): training/testing parameters. Reads 'data_dir' and
                'vars_for_feature' (required) plus the optional 'rain_fn',
                'mf_fn', 'all_at_once', 'lookback_feature' and 'li_locations'.
            m_params (dict): model parameters. Reads 'time_sequential' here.
        """
        self.t_params = t_params
        self.m_params = m_params

        self.time_sequential = m_params['time_sequential']

        data_dir = self.t_params['data_dir']

        # Python generator over the rain (target) data
        rain_fn = self.t_params.get('rain_fn',"eobs_true_rainfall_197901-201907_uk.nc")
        self.rain_data = Generator_rain(
            fp=data_dir + "/" + rain_fn,
            all_at_once=t_params.get('all_at_once',False),
        )

        # Python generator over the model field (feature) data
        mf_fn = self.t_params.get('mf_fn', "model_fields_linearly_interpolated_1979-2019.nc")
        self.mf_data = Generator_mf(
            fp=data_dir + "/" + mf_fn,
            vars_for_feature=self.t_params['vars_for_feature'],
            all_at_once=t_params.get('all_at_once',False),
            seq_len=self.t_params.get('lookback_feature',None),
        )

        # Record which locations data will be extracted for
        requested_locations = self.t_params.get('li_locations',None)
        self.location_size_calc(requested_locations)

    def location_size_calc(self, custom_location=None): 
        """ Updates the list of locations to evaluate on, and their count.

            Sets ``self.li_loc`` and ``self.loc_count``. When the list is exactly
            ["All"], the count is the number of patches returned by
            ``get_locs_for_whole_map``.

            Args:
                custom_location (list optional): A list of locations to evaluate on.
                    When None, the locations come from
                    ``utility.location_getter(m_params['model_type_settings'])``.

            Raises:
                KeyError: if no custom location is given and ``m_params`` has no
                    'model_type_settings' entry.
        """        
        if custom_location is not None:
            self.li_loc = custom_location
        else:
            # Only needed on this branch: reading it unconditionally raised a
            # KeyError for callers that pass explicit locations and omit the key.
            model_settings = self.m_params['model_type_settings']
            self.li_loc = utility.location_getter(model_settings)

        if self.li_loc == ["All"]:
            whole_map = self.rain_data.get_locs_for_whole_map(self.m_params['region_grid_params'])
            self.loc_count = len( whole_map )
        else:
            self.loc_count = len( self.li_loc )
        
            
    def load_data_era5eobs(self, batch_count, start_date,_num_parallel_calls=-1, prefetch=-1, drop_remainder=True):
        """Produces a Tensorflow Dataset combining the model-field features and the rain targets

            Reads 'lookback_feature' (default 28), 'lookback_target' (default 128) and
            'window_shift' from self.t_params, and 'time_sequential' from self.m_params.

            Args:
                batch_count (int): Total number of batches to extract; passed to location_extractor,
                    which divides it between the locations
                start_date: Date at which extraction starts; passed to get_start_idx
                _num_parallel_calls (int, optional): Number of parallel calls to use in tensorflow dataset map operations. Defaults to -1.
                prefetch (int, optional): Buffer size passed to Dataset.prefetch. Defaults to -1.
                drop_remainder (bool, optional): Passed to Dataset.window for both features and targets. Defaults to True.

            Returns:
                tuple: (tf.data.Dataset, idx_loc_in_region) as returned by location_extractor, with prefetching applied
                    to the dataset
        """    
        
        # Retrieving one index for each of the feature and target data. This index indicates the first value in the dataset to use
        start_idx_feat, start_idx_tar = self.get_start_idx(start_date)
        # the model-field generator starts reading at this index
        self.mf_data.start_idx = start_idx_feat

        
        # region - Preparing feature model fields        
        ds_feat = tf.data.Dataset.from_generator( self.mf_data , output_types=(tf.float16, tf.bool),
                     output_shapes=( tf.TensorShape([None, None, None, None]),tf.TensorShape([None, None, None, None])) ) #(values, mask) 
        # split each generated chunk into individual time steps
        ds_feat = ds_feat.unbatch()
        
        if self.m_params['time_sequential'] == True:
            # non-overlapping windows of 'lookback_feature' time steps
            ds_feat = ds_feat.window(size = self.t_params.get('lookback_feature',28) , stride=1, shift=self.t_params.get('lookback_feature',28) , drop_remainder=drop_remainder )
            ds_feat = ds_feat.flat_map( lambda *window: tf.data.Dataset.zip( tuple([w.batch(self.t_params.get('lookback_feature',28) ) for w in window ] ) ) )  # shape (lookback,h, w, 6)
            ds_feat = ds_feat.map( lambda arr_data, arr_mask: self.mf_normalize_mask( arr_data, arr_mask), num_parallel_calls= _num_parallel_calls) 
        else:
            # group 4 consecutive time steps and fold them into the channel axis
            ds_feat = ds_feat.batch(4)
            ds_feat = ds_feat.map( lambda arr_data, arr_mask: self.mf_normalize_mask( arr_data, arr_mask), num_parallel_calls= _num_parallel_calls) 
            ds_feat = ds_feat.map( lambda arr_data: tf.reshape(tf.transpose(arr_data,[1,2,0,3]), [100,140,24])  , num_parallel_calls=_num_parallel_calls )
            
        
        # endregion

        # region - Preparing Eobs target_rain_data   
        ds_tar = tf.data.Dataset.from_generator( self.rain_data, output_types=(tf.float32, tf.bool), output_shapes=( tf.TensorShape([None, None]), tf.TensorShape([ None, None])) ) # (values, mask) 
        ds_tar = ds_tar.skip(start_idx_tar)     #skipping to correct point

        if self.m_params['time_sequential'] == True:
            # windows of 'lookback_target' steps, advancing by 'window_shift' each time
            ds_tar = ds_tar.window(size = self.t_params.get('lookback_target',128) , stride=1, shift=self.t_params['window_shift'] , drop_remainder=drop_remainder )
            ds_tar = ds_tar.flat_map( lambda *window: tf.data.Dataset.zip( tuple([ w.batch(self.t_params.get('lookback_target',128) ) for w in window ] ) ) ) # shape (lookback,h, w)
        else:
            ds_tar = ds_tar

        # replace masked rain readings with the configured fill value
        ds_tar = ds_tar.map( lambda _vals, _mask: self.mask_rain( _vals, _mask ), num_parallel_calls=_num_parallel_calls ) # (values, mask)
        # endregion

        # Combining datasets
        ds = tf.data.Dataset.zip( (ds_feat, ds_tar) ) #( model_fields, (rain, rain_mask) ) 
        
        #if self.time_sequential == True:
        # crop the patches for the locations of interest and batch them
        ds, idx_loc_in_region = self.location_extractor( ds, self.li_loc, batch_count  )
        ds = ds.prefetch(prefetch)
        return ds, idx_loc_in_region
        
        # else:
        #     ds = ds.map( lambda mf, rain_mask: tuple( [tf.expand_dims(mf,0), tf.expand_dims(rain_mask[0],0), tf.expand_dims(rain_mask[1],0)] )  , num_parallel_calls= _num_parallel_calls)
        #     ds = ds.prefetch(prefetch)
        #     return ds, None        

    def get_start_idx(self, start_date):
        """ Returns two indexes
                The first index is the idx at which to start extracting data from the feature dataset
                The second index is the idx at which to start extracting data from the target dataset
            Args:
                start_date (np.datetime64): Start date for evaluation
            Returns:
                tuple (int, int): starting index for the feature, starting index for the target data
        """        

        feature_start_date = self.t_params['feature_start_date']
        target_start_date = self.t_params['target_start_date']

        feat_days_diff = np.timedelta64(start_date - feature_start_date,'6h').astype(int)
        tar_days_diff = np.timedelta64(start_date - target_start_date, 'D').astype(int)

        feat_start_idx = feat_days_diff #since the feature comes in four hour chunks
        tar_start_idx = tar_days_diff 

        return feat_start_idx, tar_start_idx
    
    def mask_rain(self, arr_rain, arr_mask):
        """Replace rain values with the configured fill value wherever the mask is False

            Args:
                arr_rain (tensor): rain values
                arr_mask (tensor): rain mask; True where the value is kept

            Returns:
                tuple: (filled rain values, the unchanged mask)
        """            
        fill_value = self.t_params['mask_fill_value']['rain']
        filled_rain = tf.where( arr_mask, arr_rain, fill_value )
        return filled_rain, arr_mask

    def mf_normalize_mask(self, arr_data, arr_mask):
        """Shift, scale and then mask the model field data

            Computes (arr_data - shift) / scale with the 'model_fields' entries of
            t_params['normalization_shift'] and t_params['normalization_scales'],
            then writes t_params['mask_fill_value']['model_field'] wherever the
            mask is False.

            Args:
                arr_data (tensor): model field data as tensor
                arr_mask (tensor): model field mask; True where the value is kept

            Returns:
                tensor: Masked and normalized model field data
        """
        shifted = tf.subtract( arr_data, self.t_params['normalization_shift']['model_fields'])
        scaled = tf.divide( shifted, self.t_params['normalization_scales']['model_fields'])

        return tf.where( arr_mask, scaled, self.t_params['mask_fill_value']['model_field'])

    def location_extractor(self, ds, locations, batch_count):
        """Crops the dataset to the patch of each location and chains the per-location datasets

                Args:
                    ds (tf.Data.dataset): dataset of ( model_fields, (rain, rain_mask) ) elements
                    locations (list): list of locations (strings) to extract, or ["All"] for
                        every patch returned by get_locs_for_whole_map
                    batch_count (int): total number of batches, divided evenly between locations

                Returns:
                    tuple: (tf.data.Dataset, [int, int] ) tuple containing the dataset and the [h,w]
                        index of the central position within a patch (outer_box_dims // 2)
        """        
        region_params = self.m_params['region_grid_params']

        # Flatten ( mf, (rain, mask) ) into ( mf, rain, mask )
        ds = ds.map( lambda mf, rain_mask: (mf, rain_mask[0], rain_mask[1]), num_parallel_calls=-1)

        # Patch boundaries, one ([upper_h, lower_h], [left_w, right_w]) entry per location
        if locations == ["All"]:
            li_hw_idxs = self.rain_data.get_locs_for_whole_map( region_params )
        else:
            li_hw_idxs = [ self.rain_data.find_idx_of_loc_region( _loc, region_params ) for _loc in locations ]

        def _crop_to(h_idxs, w_idxs):
            # map function that cuts one fixed patch out of every element
            return lambda mf, rain, rmask: self.select_region(mf, rain, rmask, h_idxs, w_idxs)

        # One cropped dataset per location
        li_ds = [ ds.map( _crop_to(_idx[0], _idx[1]), num_parallel_calls=-1) for _idx in li_hw_idxs ]

        # Re-batch each location's dataset, cap it, and chain them one after another
        batches_per_loc = int(batch_count/len(li_hw_idxs))
        is_first = True
        for loc_ds in li_ds:
            loc_ds = loc_ds.unbatch().batch( self.t_params['batch_size'], drop_remainder=True ).take(batches_per_loc)
            if is_first:
                ds = loc_ds
                is_first = False
            else:
                ds = ds.concatenate( loc_ds )

        # Index of the central location of interest within any extracted (h,w) patch
        idx_loc_in_region = np.floor_divide( region_params['outer_box_dims'], 2)
        return ds, idx_loc_in_region
    
    def select_region( self, mf, rain, rain_mask, h_idxs, w_idxs):
        """ Crop the model fields, rain and rain mask to one rectangular patch

            Args:
                mf : model field data; the crop is applied to its last-but-two and last-but-one axes
                rain : target rain data; the crop is applied to its last two axes
                rain_mask : target rain mask; cropped like rain
                h_idxs : pair [upper_h, lower_h] of row boundaries
                w_idxs : pair [left_w, right_w] of column boundaries

            Returns:
                tuple: (mf, rain, rain_mask) patches, each with a new leading axis of size 1
        """
        rows = slice(h_idxs[0], h_idxs[1])
        cols = slice(w_idxs[0], w_idxs[1])

        mf_patch = mf[ ..., rows, cols, : ]
        rain_patch = rain[ ..., rows, cols ]
        mask_patch = rain_mask[ ..., rows, cols ]

        # The leading axis keeps the later unbatch()/batch() calls compatible
        return (
            tf.expand_dims(mf_patch,axis=0),
            tf.expand_dims(rain_patch,axis=0),
            tf.expand_dims(mask_patch,axis=0),
        )
# endregion