Upgrade notebooks to v0.3.0 of the online lab #36

Status: Merged (30 commits, merged Apr 25, 2025)

Commits (30)
All commits are by juntyr; the first 29 date from Apr 3, 2025, and the final upgrade commit from Apr 25, 2025.

6f21d85  Delete utils.py
cc091ae  Delete 01-intro.ipynb
412f317  Add files via upload
7f1ab9a  Rename 01-intro(2)(5)(4).ipynb to 01-intro.ipynb
0af3ea1  Rename utils(5).py to utils.py
2644ecd  Delete 02-data-sources/02-remote.ipynb
f6bb5c9  Add files via upload
5d378ce  Rename 02-remote(5).ipynb to 02-remote.ipynb
52d487a  Delete 02-data-sources/01-local.ipynb
807a794  Delete 02-data-sources/03-cdsapi.ipynb
1db6cdf  Delete 02-data-sources/04-ecmwfapi.ipynb
2c93dd6  Add files via upload
eb234e7  Rename 01-local(2).ipynb to 01-local.ipynb
1568df5  Rename 03-cdsapi(5).ipynb to 03-cdsapi.ipynb
41a460d  Rename 04-ecmwfapi(1).ipynb to 04-ecmwfapi.ipynb
49dd559  Delete 03-examples/01-compressors.ipynb
c6dd1a1  Add files via upload
de3a142  Rename 01-compressors(5).ipynb to 01-compressors.ipynb
ddabcd4  Delete 03-examples/01-compressors.ipynb
bdddb99  Add files via upload
db3e73a  Update utils.py
89c81f9  Delete 04-example-datasets/01-hplp.ipynb
65d4016  Add files via upload
5a578e8  Delete 04-example-datasets/02-OpenIFS.ipynb
3fe09c2  Add files via upload
1fa05af  Delete 04-example-datasets/03-NextGEMS.ipynb
40b5e28  Add files via upload
a62999a  Delete 04-example-datasets/04-ICONXPP.ipynb
8b3fd52  Add files via upload
e42505f  Upgrade notebooks to v0.3.0 of the online lab

Files changed (11)

01-intro.ipynb                         +68  -60   (large diff not rendered by default)
02-data-sources/01-local.ipynb          +7   -7   (large diff not rendered by default)
02-data-sources/02-remote.ipynb        +49  -29   (large diff not rendered by default)
02-data-sources/03-cdsapi.ipynb        +16  -16   (large diff not rendered by default)
02-data-sources/04-ecmwfapi.ipynb      +21  -16   (large diff not rendered by default)
03-examples/01-compressors.ipynb       +70  -66   (large diff not rendered by default)
04-example-datasets/01-hplp.ipynb      +62  -37   (large diff not rendered by default)
04-example-datasets/02-OpenIFS.ipynb   +31  -26   (large diff not rendered by default)
04-example-datasets/03-NextGEMS.ipynb  +13  -13   (large diff not rendered by default)
04-example-datasets/04-ICONXPP.ipynb   +34  -29   (large diff not rendered by default)
utils.py                              +101  -48   (diff shown below)

@@ -1,3 +1,4 @@
+from collections.abc import Sequence
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Optional, Union
@@ -65,11 +66,10 @@ def _get_name_from_url(url: str) -> str:
 async def download_dataset_as_zarr(
     ds: "xarray.Dataset",
     name: str,
-    compressor: Union[
-        "numcodecs.abc.Codec",
-        list["numcodecs.abc.Codec"],
-        dict[str, Union["numcodecs.abc.Codec", list["numcodecs.abc.Codec"]]],
-    ],
+    *,
+    filters: None | dict[str, "None | zarr.abc.codec.ArrayArrayCodec"],
+    serializer: "None | zarr.abc.codec.ArrayBytesCodec",
+    compressors: None | dict[str, "None | zarr.abc.codec.BytesBytesCodec"],
     zip_compression: int = 0,
 ):
     import ipyfilite
@@ -96,31 +96,36 @@ async def download_dataset_as_zarr(
         mode="x",
     )
 
+    filters = (
+        filters
+        if isinstance(filters, dict)
+        else {var: filters for var in ds}
+    )
+    serializer = (
+        serializer
+        if isinstance(serializer, dict)
+        else {var: serializer for var in ds}
+    )
     compressors = (
-        compressor
-        if isinstance(compressor, dict)
-        else {var: compressor for var in ds}
+        compressors
+        if isinstance(compressors, dict)
+        else {var: compressors for var in ds}
     )
 
     encoding = dict()
-    for var, compressor in compressors.items():
-        if isinstance(compressor, list):
-            if len(compressor) == 0:
-                continue
-            encoding[var] = dict(
-                compressor=compressor[0],
-                filters=compressor[1:],
-            )
-        else:
-            encoding[var] = dict(
-                compressor=compressor,
-                filters=[],
-            )
+    for var in ds:
+        encoding[var] = dict(
+            filters=filters[var],
+            compressors=compressors[var],
+            **({
+                "serializer": serializer[var]
+            } if serializer[var] is not None else {})
+        )
 
     ds.to_zarr(store=store, mode="w-", encoding=encoding)
 
-    for key in store.keys():
-        chunk_store[key] = store[key]
+    async for key in store.list():
+        await chunk_store.set(key, await store.get(key, zarr.core.buffer.core.default_buffer_prototype()))
 
     store.close()
     chunk_store.close()
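
For orientation, a minimal usage sketch of the new keyword-only interface follows (not part of the PR: the toy dataset, variable name, and codec choices are hypothetical; zarr.codecs.BytesCodec and zarr.codecs.ZstdCodec are built-in zarr-python 3.x codecs). As the wrapping logic above shows, a single codec or None is broadcast to every variable, so per-variable dicts are optional:

    import numpy as np
    import xarray as xr
    import zarr

    # hypothetical toy dataset standing in for the notebooks' real data
    ds = xr.Dataset({"t2m": ("x", np.random.rand(1024).astype("float32"))})

    # the helper is a coroutine, so it must be awaited (e.g. in a notebook cell)
    await download_dataset_as_zarr(
        ds,
        "t2m-example",  # hypothetical download name
        filters=None,  # no ArrayArrayCodec applied before serialization
        serializer=zarr.codecs.BytesCodec(),  # ArrayBytesCodec for all variables
        compressors={var: zarr.codecs.ZstdCodec(level=3) for var in ds},
        zip_compression=0,
    )

The diff continues with the reworked metrics helper: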
@@ -137,45 +142,93 @@ async def file_download_path(name: str) -> Path:
     pass
 
 
-def format_compress_stats(
-    codecs: list["numcodecs.abc.Codec"],
-    stats: list["fcbench.compressor.types.CodecPerformanceMeasurement"],
+def format_compression_metrics(
+    codecs: Sequence["numcodecs.abc.Codec"],
+    *,
+    nbytes: "numcodecs_observers.bytesize.BytesizeObserver",
+    instructions: "None | numcodecs_wasm.WasmCodecInstructionCounterObserver" = None,
+    timings: "None | numcodecs_observers.walltime.WalltimeObserver" = None,
 ):
     import pandas as pd
+    from numcodecs_observers.hash import HashableCodec
+
+    codecs = tuple(codecs)
+
+    encoded_bytes = { c: sum(e.post for e in es) for c, es in nbytes.encode_sizes.items() }
+    decoded_bytes = { c: sum(d.post for d in ds) for c, ds in nbytes.decode_sizes.items() }
 
     table = pd.DataFrame(
         {
-            "Codec": [],
-            "compression ratio [raw B / enc B]": [],
-            "encode throughput [raw GB/s]": [],
-            "decode throughput [raw GB/s]": [],
-            "encode instructions [#/B]": [],
-            "decode instructions [#/B]": [],
+            "Codec": [str(c) for c in codecs] + ["Summary"],
+            "compression ratio [raw B / enc B]": [
+                round(decoded_bytes[HashableCodec(c)] / encoded_bytes[HashableCodec(c)], 2) for c in codecs
+            ] + ([
+                round(decoded_bytes[HashableCodec(codecs[0])] / encoded_bytes[HashableCodec(codecs[-1])], 2)
+            ] if len(codecs) > 0 else [1.0]),
         }
     ).set_index(["Codec"])
 
-    for codec, stat in zip(codecs, stats):
-        table.loc[str(codec), :] = [
-            round(stat.decoded_bytes / stat.encoded_bytes, 2),
-            round(
-                1e-9
-                * stat.decoded_bytes
-                / (stat.encode_timing.secs + stat.encode_timing.nanos * 1e-9),
-                2,
-            ),
-            round(
-                1e-9
-                * stat.decoded_bytes
-                / (stat.decode_timing.secs + stat.decode_timing.nanos * 1e-9),
-                2,
-            ),
-            round(stat.encode_instructions / stat.decoded_bytes, 1)
-            if stat.encode_instructions is not None
-            else None,
-            round(stat.decode_instructions / stat.decoded_bytes, 1)
-            if stat.decode_instructions is not None
-            else None,
-        ]
+    if instructions is not None:
+        table["encode instructions [#/B]"] = [
+            (round(
+                sum(instructions.encode_instructions[HashableCodec(c)])
+                / decoded_bytes[HashableCodec(c)],
+                1,
+            ) if HashableCodec(c) in instructions.encode_instructions else "<unknown>") for c in codecs
+        ] + ([
+            round(
+                sum(sum(instructions.encode_instructions[HashableCodec(c)]) for c in codecs)
+                / decoded_bytes[HashableCodec(codecs[0])],
+                1,
+            ) if all(HashableCodec(c) in instructions.encode_instructions for c in codecs) else "<unknown>"
+        ] if len(codecs) > 0 else [0.0])
+
+        table["decode instructions [#/B]"] = [
+            (round(
+                sum(instructions.decode_instructions[HashableCodec(c)])
+                / decoded_bytes[HashableCodec(c)],
+                1,
+            ) if HashableCodec(c) in instructions.decode_instructions else "<unknown>") for c in codecs
+        ] + ([
+            round(
+                sum(sum(instructions.decode_instructions[HashableCodec(c)]) for c in codecs)
+                / decoded_bytes[HashableCodec(codecs[0])],
+                1,
+            ) if all(HashableCodec(c) in instructions.decode_instructions for c in codecs) else "<unknown>"
+        ] if len(codecs) > 0 else [0.0])
+
+    if timings is not None:
+        table["encode throughput [raw GB/s]"] = [
+            round(
+                1e-9
+                * decoded_bytes[HashableCodec(c)]
+                / sum(timings.encode_times[HashableCodec(c)]),
+                2,
+            ) for c in codecs
+        ] + ([
+            round(
+                1e-9
+                * decoded_bytes[HashableCodec(codecs[0])]
+                / sum(sum(timings.encode_times[HashableCodec(c)]) for c in codecs),
+                2,
+            )
+        ] if len(codecs) > 0 else [0.0])
+
+        table["decode throughput [raw GB/s]"] = [
+            round(
+                1e-9
+                * decoded_bytes[HashableCodec(c)]
+                / sum(timings.decode_times[HashableCodec(c)]),
+                2,
+            ) for c in codecs
+        ] + ([
+            round(
+                1e-9
+                * decoded_bytes[HashableCodec(codecs[0])]
+                / sum(sum(timings.decode_times[HashableCodec(c)]) for c in codecs),
+                2,
+            )
+        ] if len(codecs) > 0 else [0.0])
 
     return table

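Finally, a hedged sketch of how the new observer-based helper might be called (assumptions: the observer classes come from the numcodecs-observers package named in the signature above; attaching the observers to the actual encode/decode calls happens elsewhere in the notebooks and is not shown in this diff):

    from numcodecs import Zstd
    from numcodecs_observers.bytesize import BytesizeObserver
    from numcodecs_observers.walltime import WalltimeObserver

    codecs = [Zstd(level=3)]  # hypothetical codec pipeline
    nbytes = BytesizeObserver()
    timings = WalltimeObserver()

    # ... encode/decode data with the codecs while the observers record
    #     byte sizes and wall times (observer wiring not shown here) ...

    # instructions=None, so the WASM instruction-count columns are omitted
    table = format_compression_metrics(codecs, nbytes=nbytes, timings=timings)
    print(table)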