LP DAAC ECOSTRESS example with ECOSTRESS LSTE Kerchunk Reference File

!mamba install kerchunk -y --quiet
!mamba install h5py=3.2 -y --quiet # Default version in this environment does not work. Must update

import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import logging
import fsspec
import ujson
import requests
from tqdm import tqdm
from glob import glob
import os
import hvplot.xarray
# The xarray produced from the reference file throws a SerializationWarning for each variable. Will need to explore why.
#import warnings
#warnings.simplefilter("ignore")

from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr

s3_cred_endpoint = {
    'podaac':'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'lpdaac':'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
    'ornldaac':'https://data.ornldaac.earthdata.nasa.gov/s3credentials',
    'gesdisc':'https://data.gesdisc.earthdata.nasa.gov/s3credentials'
}

def get_temp_creds(provider):
    temp_creds_url = s3_cred_endpoint[provider]
    return requests.get(temp_creds_url).json()

temp_creds_req = get_temp_creds('lpdaac')

Direct Access a Single file
fs = s3fs.S3FileSystem(
    anon=False,
    key=temp_creds_req['accessKeyId'],
    secret=temp_creds_req['secretAccessKey'],
    token=temp_creds_req['sessionToken']
)

url = 's3://lp-prod-protected/ECO_L1B_GEO.002/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01.h5'

s3_file_obj = fs.open(url, mode='rb')

import h5py
h5_file = h5py.File(s3_file_obj)
#list(h5_file['Geolocation']['latitude'])
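Before trying xarray, it helps to see how the file is laid out internally. A quick inspection sketch using the h5_file handle opened above (visit walks the full hierarchy and prints every group and dataset path):

print(list(h5_file.keys()))  # top-level groups
h5_file.visit(print)         # full tree; output can be long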
%%time
xr_ds = xr.open_dataset(s3_file_obj, chunks='auto', engine='h5netcdf', backend_kwargs={"mask_and_scale": False, "decode_times": False, "decode_timedelta": False, "use_cftime": False, "decode_coords": False})
xr_ds

CPU times: user 134 ms, sys: 35 ms, total: 169 ms
Wall time: 534 ms
<xarray.Dataset>
Dimensions: ()
Data variables:
    *empty*

The dataset comes back empty because the variables in this ECOSTRESS file live inside HDF5 groups (e.g., Geolocation), and xr.open_dataset only reads the root group by default.
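A minimal sketch of how one of the groups could be read instead, using the Geolocation group seen in the h5py listing above (the group keyword is supported by the h5netcdf backend; the exact variable layout inside the group is an assumption to verify):

xr_geo = xr.open_dataset(s3_file_obj, engine='h5netcdf', group='Geolocation', chunks='auto')
xr_geo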
Specify a list of S3 URLs (MERRA2, 05/01/2019 - 05/31/2019)

urls = ['s3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190501.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190502.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190503.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190504.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190505.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190506.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190507.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190508.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190509.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190510.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190511.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190512.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190513.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190514.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190515.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190516.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190517.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190518.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190519.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190520.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190521.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190522.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190523.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190524.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190525.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190526.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190527.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190528.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190529.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190530.nc4',
's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/MERRA2_400.tavg1_2d_slv_Nx.20190531.nc4']
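Since the filenames differ only by day, the same list can be generated programmatically. A sketch assuming the MERRA2_400 stream applies to every day in May 2019, as it does in the hand-written list above (urls_generated is a stand-in name):

base = 's3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05'
urls_generated = [f'{base}/MERRA2_400.tavg1_2d_slv_Nx.201905{day:02d}.nc4' for day in range(1, 32)]
assert urls_generated == urls  # should match the hand-written list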
Generate the reference kerchunk files. These files can take a little time to generate. Do not run this step if the JSON files already exist in the ./jsons/ directory. Note that temporary S3 credentials are scoped to the DAAC that issued them, so accessing these GES DISC buckets may require rebuilding fs with credentials from get_temp_creds('gesdisc').

so = dict(
    mode="rb",
    anon=False,
    default_fill_cache=False,  # do not cache read blocks; kerchunk touches each byte range once
    default_cache_type="none"
)
#references = []
for u in urls:
    with fs.open(u, **so) as infile:
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outfile:
            translate = h5chunks.translate()
            outfile.write(ujson.dumps(translate).encode())
            #references.append(translate)
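To honor the do-not-rerun note above, the loop can be made idempotent by skipping granules whose JSON already exists. A sketch; needs_reference is a hypothetical helper that mirrors the output naming used by the loop:

os.makedirs('jsons', exist_ok=True)  # ensure the output directory exists

def needs_reference(url, out_dir='jsons'):
    # True if this granule's reference JSON has not been written yet
    return not os.path.exists(os.path.join(out_dir, f"{url.split('/')[-1]}.json"))

for u in filter(needs_reference, urls):
    ...  # same body as the loop above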
Create a list with the paths to the reference files

fs_ref_list = fsspec.filesystem('file')
reference_list = sorted([x for x in fs_ref_list.ls('jsons') if '.json' in x])
reference_list

Open the first reference file to read into an xarray dataset
with open(reference_list[0]) as j:
    reference = ujson.load(j)
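The loaded reference is a plain dictionary in kerchunk's reference format. A quick inspection sketch (the refs key typically maps Zarr keys either to inlined bytes or to [url, offset, length] entries pointing into the source file):

print(reference.keys())                     # typically dict_keys(['version', 'refs'])
print(list(reference['refs'].items())[:3])  # first few key-to-chunk mappings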
Set configuration options

s_opts = {'skip_instance_cache':True} # json
r_opts = {'anon':False,
          'key':temp_creds_req['accessKeyId'],
          'secret':temp_creds_req['secretAccessKey'],
          'token':temp_creds_req['sessionToken']} # ncfiles

fs_single = fsspec.filesystem("reference",
                              fo=reference,
                              ref_storage_args=s_opts,
                              remote_protocol='s3',
                              remote_options=r_opts)

Read in a single reference object. This produces many SerializationWarnings (they can be silenced with the warnings package), but the fill value, data range, minimum value, and maximum value DO NOT match the source file.
%%time
m = fs_single.get_mapper("")
ds_single = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
ds_single
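One way to chase the mismatched metadata noted above is to compare a variable's attributes from the reference-backed dataset against the same granule opened directly from S3. A hedged sketch, assuming the T500 variable and the usual MERRA-2 attribute names (fmissing_value, vmin, vmax):

with fs.open(urls[0], mode='rb') as f:
    ds_direct = xr.open_dataset(f, engine='h5netcdf')
    for attr in ('fmissing_value', 'vmin', 'vmax'):
        print(attr, '| direct:', ds_direct['T500'].attrs.get(attr),
              '| kerchunk:', ds_single['T500'].attrs.get(attr))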
Combine the single reference files created above into a single time series reference file. THIS DID NOT WORK

# mzz = MultiZarrToZarr(
#     reference_list,
#     remote_protocol="s3",
#     remote_options=r_opts,
#     concat_dims=["time"]
# )
# out = mzz.translate()

# fs_mzz = fsspec.filesystem("reference",
#                          fo=out,
#                          ref_storage_args=s_opts,
#                          remote_protocol='s3',
#                          remote_options=r_opts)

# m = fs_mzz.get_mapper("")
# ds_multi = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
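A possible fix, left commented like the cells above because it is not verified here: newer kerchunk releases accept an identical_dims argument that tells MultiZarrToZarr which coordinates are the same in every file, which is often the missing piece for gridded products like MERRA-2.

# mzz = MultiZarrToZarr(
#     reference_list,
#     remote_protocol='s3',
#     remote_options=r_opts,
#     concat_dims=['time'],
#     identical_dims=['lat', 'lon']  # grid coordinates shared by every granule
# )
# out = mzz.translate()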
Combine the single reference files created above into a single time series reference file. THIS DID WORK

%%time
ds_k = []
for ref in reference_list:
    fs_ref = fsspec.filesystem(
        "reference",
        fo=ref,
        ref_storage_args=s_opts,
        remote_protocol='s3',
        remote_options=r_opts)  # one reference filesystem per daily file
    m = fs_ref.get_mapper("")
    ds_k.append(xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={}))
ds_multi = xr.concat(ds_k, dim='time')
ds_multi

Again, the fill value, data range, minimum value, and maximum value DO NOT match the source file.
ds_multi['T500']

# Commenting for quarto site render
# ds_multi['T500'].hvplot.image(x='lon', y='lat')
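Since the interactive plot is commented out for the site render, a static matplotlib sketch of the same field (first time step; T500 is air temperature at 500 hPa in this collection):

ds_multi['T500'].isel(time=0).plot()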