!mamba install kerchunk -y --quiet
LP DAAC ECOSTRESS: Example with an ECOSTRESS LSTE Kerchunk Reference File
!mamba install h5py=3.2 -y --quiet # Default version in this environment does not work. Must update
import numpy as np
import xarray as xr
import matplotlib.pyplot as plt
import s3fs
import datetime as dt
import logging
import fsspec
import ujson
import requests
from tqdm import tqdm
from glob import glob
import os
import hvplot.xarray
# The xarray produced from the reference file throws a SerializationWarning for each variable. Will need to explore why
#import warnings
#warnings.simplefilter("ignore")
from kerchunk.hdf import SingleHdf5ToZarr
from kerchunk.combine import MultiZarrToZarr
# Mapping of NASA DAAC short names to the endpoints that issue temporary
# S3 credentials for in-region (us-west-2) direct access.
s3_cred_endpoint = {
    'podaac': 'https://archive.podaac.earthdata.nasa.gov/s3credentials',
    'lpdaac': 'https://data.lpdaac.earthdatacloud.nasa.gov/s3credentials',
    'ornldaac': 'https://data.ornldaac.earthdata.nasa.gov/s3credentials',
    'gesdisc': 'https://data.gesdisc.earthdata.nasa.gov/s3credentials'
}
def get_temp_creds(provider='lpdaac'):
    """Request temporary S3 credentials from an Earthdata DAAC endpoint.

    Parameters
    ----------
    provider : str, optional
        Key into ``s3_cred_endpoint`` ('podaac', 'lpdaac', 'ornldaac',
        'gesdisc'). Defaults to 'lpdaac', matching the original behavior.

    Returns
    -------
    dict
        Parsed JSON response; the code below reads the keys
        'accessKeyId', 'secretAccessKey', and 'sessionToken'.
    """
    temp_creds_url = s3_cred_endpoint[provider]
    # timeout prevents an indefinite hang if the endpoint is unreachable;
    # raise_for_status fails loudly instead of JSON-decoding an error page
    resp = requests.get(temp_creds_url, timeout=30)
    resp.raise_for_status()
    return resp.json()


temp_creds_req = get_temp_creds()
Directly access a single file
# Authenticated S3 filesystem using the temporary Earthdata credentials.
fs = s3fs.S3FileSystem(
    anon=False,
    key=temp_creds_req['accessKeyId'],
    secret=temp_creds_req['secretAccessKey'],
    token=temp_creds_req['sessionToken']
)

# One ECOSTRESS L1B geolocation granule (HDF5).
url = 's3://lp-prod-protected/ECO_L1B_GEO.002/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01/ECOv002_L1B_GEO_21547_021_20220424T215449_0700_01.h5'
s3_file_obj = fs.open(url, mode='rb')

import h5py  # NOTE(review): ideally moved to the top-of-file import block
# Explicit read-only mode; avoids the h5py default-mode deprecation warning.
h5_file = h5py.File(s3_file_obj, 'r')
#list(h5_file['Geolocation']['latitude'])
%%time
= xr.open_dataset(s3_file_obj, chunks='auto', engine='h5netcdf', backend_kwargs={"mask_and_scale" : False, "decode_times" : False, "decode_timedelta" : False, "use_cftime" : False, "decode_coords" : False})
xr_ds xr_ds
CPU times: user 134 ms, sys: 35 ms, total: 169 ms
Wall time: 534 ms
<xarray.Dataset> Dimensions: () Data variables: *empty*
Specify a list of S3 URLs (MERRA2, 05/01/2019 - 05/31/2019)
# MERRA-2 hourly single-level diagnostics (M2T1NXSLV), one granule per day
# for 2019-05-01 through 2019-05-31. Built programmatically rather than
# hard-coding 31 nearly identical URLs.
_merra2_prefix = ('s3://gesdisc-cumulus-prod-protected/MERRA2/M2T1NXSLV.5.12.4/2019/05/'
                  'MERRA2_400.tavg1_2d_slv_Nx.201905')
urls = [f'{_merra2_prefix}{day:02d}.nc4' for day in range(1, 32)]
Generate the reference kerchunk files. These files can take a little time to generate. Do not run if JSON files already exist in ./jsons/ directory.
# s3fs open() options for kerchunk scanning: binary reads, authenticated,
# with caching disabled so kerchunk sees true byte offsets.
so = dict(
    mode="rb",
    anon=False,
    default_fill_cache=False,
    default_cache_type="none"
)
#references = []
# Ensure the output directory exists before writing reference JSONs.
os.makedirs('jsons', exist_ok=True)
for u in urls:
    with fs.open(u, **so) as infile:
        # Scan the HDF5 granule and build its kerchunk reference dict;
        # chunks smaller than 300 bytes are inlined into the JSON.
        h5chunks = SingleHdf5ToZarr(infile, u, inline_threshold=300)
        translate = h5chunks.translate()
        with open(f"jsons/{u.split('/')[-1]}.json", 'wb') as outfile:
            outfile.write(ujson.dumps(translate).encode())
            #references.append(translate)
Create a list with the paths to the reference files
# List the reference JSONs written above, sorted so they concatenate in
# chronological order. endswith() is a stricter filter than substring match.
fs_ref_list = fsspec.filesystem('file')
reference_list = sorted(x for x in fs_ref_list.ls('jsons') if x.endswith('.json'))
reference_list
Open the first reference file to read into an xarray dataset
# Load the first kerchunk reference file as a dict.
with open(reference_list[0]) as j:
    reference = ujson.load(j)
Set configurations options
# Options for reading the reference JSON itself...
s_opts = {'skip_instance_cache': True}  # json
# ...and for reaching the remote netCDF4 bytes on S3.
r_opts = {'anon': False,
          'key': temp_creds_req['accessKeyId'],
          'secret': temp_creds_req['secretAccessKey'],
          'token': temp_creds_req['sessionToken']}  # ncfiles
# Virtual filesystem backed by the single kerchunk reference.
fs_single = fsspec.filesystem("reference",
                              fo=reference,
                              ref_storage_args=s_opts,
                              remote_protocol='s3',
                              remote_options=r_opts)
Read in a single reference object. This emits many SerializationWarnings (suppressed here with the warnings package), but note that the fill value, data range, min value, and max value DO NOT match the source file.
%%time
= fs_single.get_mapper("")
m = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
ds_single ds_single
#ds_single
Combine the single reference files created above into a single time series reference file THIS DID NOT WORK
# mzz = MultiZarrToZarr(
# reference_list,
# remote_protocol="s3",
# remote_options=r_opts,
# concat_dims=["time"]
# )
# out = mzz.translate()
# fs_mzz = fsspec.filesystem("reference",
# fo=out,
# ref_storage_args=s_opts,
# remote_protocol='s3',
# remote_options=r_opts)
# m = fs_mzz.get_mapper("")
# ds_multi = xr.open_dataset(m, engine="zarr", backend_kwargs={'consolidated':False}, chunks={})
Combine the single reference files created above into a single time series reference file THIS DID WORK
%%time
=[]
ds_k
for ref in reference_list:
= s_opts
s_opts = r_opts
r_opts = fsspec.filesystem(
fs "reference",
=ref,
fo=s_opts,
ref_storage_args='s3',
remote_protocol=r_opts)
remote_options= fs.get_mapper("")
m ="zarr", backend_kwargs={'consolidated':False}, chunks={}))
ds_k.append(xr.open_dataset(m, engine
= xr.concat(ds_k, dim='time')
ds_multi
ds_multi
Again, the fill value, data range, min value, and max value DO NOT match the source file.
#ds_multi = xr.concat(ds_k, dim='time')
#ds_multi
# Display the 500 hPa air temperature variable from the combined dataset.
ds_multi['T500']
# Commenting for quarto site render
# ds_multi['T500'].hvplot.image(x='lon', y='lat')