LP DAAC ECOSTRESS

Example with ECOSTRESS LSTE Kerchunk Reference File

!mamba install kerchunk -y --quiet
!mamba install h5py=3.2 -y --quiet # The default h5py version in this environment does not work; pin to 3.2
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import logging
import fsspec
import ujson
import requests
from tqdm import tqdm
from glob import glob
import os
import hvplot.xarray

# The xarray produced from the reference file throws a SerializationWarning for each variable. Will need to explore why
#import warnings
#warnings.simplefilter("ignore")
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
s3_cred_endpoint = {
    'podaac':'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'lpdaac':'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
    'ornldaac':'https://data.ornldaac.earthdata.nasa.gov/s3credentials',
    'gesdisc':'https://data.gesdisc.earthdata.nasa.gov/s3credentials'
}
def get_temp_creds():
    temp_creds_url = s3_cred_endpoint['lpdaac']
    return requests.get(temp_creds_url).json()
temp_creds_req = get_temp_creds()
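
The endpoint dictionary above already covers several DAACs, so the helper can take the provider as an argument. A small sketch (the function name and provider argument are illustrative, not part of the original notebook):

# Illustrative variant: request credentials for any DAAC listed in s3_cred_endpoint
def get_temp_creds_for(provider='lpdaac'):
    return requests.get(s3_cred_endpoint[provider]).json()

# e.g., credentials for the MERRA2 (GES DISC) URLs used later in this notebook
# gesdisc_creds = get_temp_creds_for('gesdisc')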

Direct Access of a Single File

fs = s3fs.S3FileSystem(
    anon=False,
    key=temp_creds_req['accessKeyId'],
    secret=temp_creds_req['secretAccessKey'],
    token=temp_creds_req['sessionToken']
)
url = 's3://lp-prod-protected/ECO_L1B_GEO.002/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01.h5'
s3_file_obj = fs.open(url, mode='rb')
import h5py
h5_file = h5py.File(s3_file_obj)
#list(h5_file['Geolocation']['latitude'])
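
Since the variables in this granule live inside HDF5 groups, it can help to list the group contents with h5py before opening with xarray (the Geolocation group is the one referenced above; other group names depend on the product):

# Inspect the HDF5 layout: top-level groups, then the datasets in the Geolocation group
print(list(h5_file.keys()))
print(list(h5_file['Geolocation'].keys()))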
%%time
xr_ds = xr.open_dataset(
    s3_file_obj,
    chunks='auto',
    engine='h5netcdf',
    backend_kwargs={
        "mask_and_scale": False,
        "decode_times": False,
        "decode_timedelta": False,
        "use_cftime": False,
        "decode_coords": False
    }
)
xr_ds
CPU times: user 134 ms, sys: 35 ms, total: 169 ms
Wall time: 534 ms
<xarray.Dataset>
Dimensions:  ()
Data variables:
    *empty*
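
The root group of this ECOSTRESS file contains no variables, which is why the dataset above comes back empty; the data live in HDF5 groups such as Geolocation. A minimal sketch of opening a specific group instead (group and variable names depend on the product):

# Open one HDF5 group rather than the (empty) root group
xr_geo = xr.open_dataset(fs.open(url, mode='rb'), group='Geolocation', engine='h5netcdf', chunks='auto')
xr_geo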

Specify a list of S3 URLs (MERRA2, 05/01/2019 - 05/31/2019)

urls = ['s3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190501.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190502.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190503.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190504.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190505.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190506.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190507.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190508.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190509.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190510.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190511.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190512.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190513.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190514.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190515.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190516.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190517.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190518.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190519.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190520.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190521.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190522.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190523.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190524.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190525.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190526.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190527.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190528.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190529.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190530.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190531.nc4']
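
Note that these MERRA2 granules sit in a GES DISC bucket, so the s3fs filesystem needs GES DISC credentials rather than the LP DAAC credentials requested above. A sketch of rebuilding fs with the gesdisc endpoint already defined in s3_cred_endpoint (this step is an assumption, not part of the original notebook):

# Request GES DISC credentials and rebuild the S3 filesystem used to read the MERRA2 files below
gesdisc_creds = requests.get(s3_cred_endpoint['gesdisc']).json()
fs = s3fs.S3FileSystem(
    anon=False,
    key=gesdisc_creds['accessKeyId'],
    secret=gesdisc_creds['secretAccessKey'],
    token=gesdisc_creds['sessionToken']
)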

Generate the Kerchunk reference files. These can take a little time to generate; do not run this cell if the JSON files already exist in the ./jsons/ directory.

so = dict(
    mode="rb",
    anon=False,
    default_fill_cache=False,
    default_cache_type="none"
)

# Ensure the output directory exists before writing the reference JSONs
os.makedirs('jsons', exist_ok=True)

#references = []
for u in tqdm(urls):
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outfile:
            translate = h5chunks.translate()
            outfile.write(ujson.dumps(translate).encode())
            #references.append(translate)
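
A quick sanity check that one reference JSON was written per granule (glob is already imported above):

# Expect the two counts to match
print(len(urls), len(glob('jsons/*.json')))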

Create a list with the paths to the reference files

fs_ref_list = fsspec.filesystem('file')
reference_list = sorted([x for x in fs_ref_list.ls('jsons') if '.json' in x])
reference_list

Open the first reference file to read into an xarray dataset

with open(reference_list[0]) as j:
    reference = ujson.load(j)

Set configuration options

s_opts = {'skip_instance_cache':True}   #json
r_opts = {'anon':False,          
          'key':temp_creds_req['accessKeyId'], 
          'secret':temp_creds_req['secretAccessKey'], 
          'token':temp_creds_req['sessionToken']}    #ncfiles
fs_single = fsspec.filesystem("reference", 
                       fo=reference, 
                       ref_storage_args=s_opts,
                       remote_protocol='s3', 
                       remote_options=r_opts)

Read in a single reference object. This throws many SerializationWarnings, which can be silenced with the warnings package (commented out above), but the fill value, data range, minimum value, and maximum value DO NOT match the source file.

%%time
m = fs_single.get_mapper("")
ds_single = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
ds_single
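
An equivalent, slightly more compact pattern is to let xarray build the reference mapper itself via the reference:// protocol. This is only a sketch and assumes an xarray/zarr/fsspec combination that accepts storage_options for the zarr backend:

# Same dataset, opened without creating the mapper by hand
ds_single_alt = xr.open_dataset(
    "reference://",
    engine="zarr",
    backend_kwargs={
        "consolidated": False,
        "storage_options": {
            "fo": reference,
            "remote_protocol": "s3",
            "remote_options": r_opts
        }
    }
)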

Combine the single reference files created above into a single time-series reference file. THIS DID NOT WORK.

# mzz = MultiZarrToZarr(
#     reference_list,
#     remote_protocol="s3",
#     remote_options=r_opts,
#     concat_dims=["time"]
# )

# out = mzz.translate()
# fs_mzz = fsspec.filesystem("reference",
#                            fo=out,
#                            ref_storage_args=s_opts,
#                            remote_protocol='s3',
#                            remote_options=r_opts)
# m = fs_mzz.get_mapper("")
# ds_multi = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
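
One untested possibility, left here only as a commented sketch (the output filename is illustrative), is to also declare the dimensions that are identical across files before translating, and write the combined reference out for reuse:

# mzz = MultiZarrToZarr(
#     reference_list,
#     remote_protocol="s3",
#     remote_options=r_opts,
#     concat_dims=["time"],
#     identical_dims=["lat", "lon"]
# )
# with open("jsons/merra2_combined.json", "wb") as f:
#     f.write(ujson.dumps(mzz.translate()).encode())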

Combine the single reference files created above into a single time-series reference file. THIS DID WORK.

%%time
ds_k = []

for ref in reference_list:
    # Build a reference filesystem for each JSON and open it lazily with the zarr engine
    fs_ref = fsspec.filesystem(
        "reference",
        fo=ref,
        ref_storage_args=s_opts,
        remote_protocol='s3',
        remote_options=r_opts)
    m = fs_ref.get_mapper("")
    ds_k.append(xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={}))
    
ds_multi = xr.concat(ds_k, dim='time')
    
ds_multi

Again, the fill value, data range, minimum value, and maximum value DO NOT match the source file.

#ds_multi = xr.concat(ds_k, dim='time')
#ds_multi
ds_multi['T500']
# Commenting for quarto site render
# ds_multi['T500'].hvplot.image(x='lon', y='lat')
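
As a static alternative to the hvplot call above (commented out for the Quarto render), a single time step can be plotted with matplotlib. The variable and dimension names below assume the MERRA2 M2T1NXSLV layout:

# Plot the first time step of 500 hPa air temperature
ds_multi['T500'].isel(time=0).plot()
plt.show()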