Graph Machine Learning
AnemoI
English
aifs-single-1.0 / config_finetuning.yaml
anaprietonem's picture
Update config_finetuning.yaml
8c9e0f6 verified
# Dataset description and preprocessing settings.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
data:
  format: zarr
  resolution: n320
  frequency: 6h
  timestep: 6h
  # presumably input-only variables -- TODO confirm
  forcing:
    - cos_latitude
    - cos_longitude
    - sin_latitude
    - sin_longitude
    - cos_julian_day
    - cos_local_time
    - sin_julian_day
    - sin_local_time
    - insolation
    - lsm
    - sdor
    - slor
    - z
  # presumably output-only variables -- TODO confirm
  diagnostic:
    - tp
    - cp
    - sf
    - tcc
    - hcc
    - lcc
    - mcc
    - ro
    - ssrd
    - strd
    - 100u
    - 100v
  remapped: null
  # Normalisation method per variable, grouped by method name.
  normalizer:
    default: mean-std
    # NOTE(review): looks like cp and sf reuse the statistics of tp -- confirm
    remap:
      cp: tp
      sf: tp
    std:
      - tp
      - cp
      - sf
      - ro
      - tcw
      - ssrd
      - q_50
      - q_100
      - q_150
      - q_200
      - q_250
      - q_300
      - q_400
      - q_500
      - q_600
      - q_700
      - q_850
      - q_925
      - q_1000
    min-max: null
    max:
      - sdor
      - slor
      - z
    # `none` is a plain string key here, not a YAML null.
    none:
      - cos_latitude
      - cos_longitude
      - sin_latitude
      - sin_longitude
      - cos_julian_day
      - cos_local_time
      - sin_julian_day
      - sin_local_time
      - insolation
      - lsm
      - tcc
      - mcc
      - hcc
      - lcc
      - swvl1
      - swvl2
  imputer:
    default: none
  remapper:
    default: none
  processors:
    normalizer:
      _target_: anemoi.models.preprocessing.normalizer.InputNormalizer
      _convert_: all
      # Same values as data.normalizer above, written out in full;
      # keep the two copies in sync when editing either one.
      config:
        default: mean-std
        remap:
          cp: tp
          sf: tp
        std:
          - tp
          - cp
          - sf
          - ro
          - tcw
          - ssrd
          - q_50
          - q_100
          - q_150
          - q_200
          - q_250
          - q_300
          - q_400
          - q_500
          - q_600
          - q_700
          - q_850
          - q_925
          - q_1000
        min-max: null
        max:
          - sdor
          - slor
          - z
        none:
          - cos_latitude
          - cos_longitude
          - sin_latitude
          - sin_longitude
          - cos_julian_day
          - cos_local_time
          - sin_julian_day
          - sin_local_time
          - insolation
          - lsm
          - tcc
          - mcc
          - hcc
          - lcc
          - swvl1
          - swvl2
  num_features: 115
# Data loading settings: worker counts, batch sizes and the dataset
# selection for the training and validation splits.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
dataloader:
  prefetch_factor: 2
  pin_memory: true
  read_group_size: 4
  num_workers:
    training: 8
    validation: 8
    test: 8
    predict: 8
  batch_size:
    training: 1
    validation: 1
    test: 4
    predict: 4
  limit_batches:
    training: 1000
    validation: 10
    test: 20
    predict: 20
  # Dataset locations are composed from hardware.paths.data and the file
  # names under hardware.files.
  dataset: ${hardware.paths.data}/${hardware.files.dataset}
  land_dataset: ${hardware.paths.data}/${hardware.files.dataset_land}
  land_variables: [100u, 100v, swvl1, swvl2, stl1, stl2, tcc, lcc, mcc, hcc, sf, ro, strd, ssrd]
  training:
    # Two sources: the main dataset, and the land dataset restricted to
    # land_variables through `select`.
    dataset:
      - dataset: ${dataloader.dataset}
        start: null
        end: 2022
        frequency: ${data.frequency}
        drop: []
      - dataset: ${dataloader.land_dataset}
        start: null
        end: 2022
        frequency: ${data.frequency}
        select: ${dataloader.land_variables}
    # NOTE(review): placed as siblings of `dataset`; nesting them inside the
    # second list entry would duplicate its start/end keys -- confirm.
    start: null
    end: 2022
    drop: []
  validation:
    dataset:
      - dataset: ${dataloader.dataset}
        start: 2022
        end: 2022
        frequency: ${data.frequency}
        drop: []
      - dataset: ${dataloader.land_dataset}
        start: 2022
        end: 2022
        frequency: ${data.frequency}
        select: ${dataloader.land_variables}
    # NOTE(review): same placement reasoning as for the training split.
    start: 2022
    end: 2022
    drop: []
  validation_rollout: 1
# Plotting, profiling, checkpointing and logging settings.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
diagnostics:
  plot:
    asynchronous: false
    datashader: true
    frequency:
      batch: 750
      epoch: 10
    parameters: [tp]
    sample_idx: 0
    precip_and_related_fields: [tp, cp]
    callbacks: []
    # NOTE(review): enabled/scatter/mode are assumed to belong to `plot`
    # because they precede the diagnostics-level `callbacks` -- confirm.
    enabled: true
    scatter: false
    mode: asyncio
  callbacks: {}
  benchmark_profiler:
    memory:
      enabled: true
      steps: 5
      warmup: 2
      extra_plots: false
      trace_rank0_only: false
    time:
      enabled: true
      verbose: false
    speed:
      enabled: true
    system:
      enabled: true
    model_summary:
      enabled: true
    snapshot:
      enabled: true
      steps: 4
      warmup: 0
  debug:
    anomaly_detection: false
  profiler: false
  enable_checkpointing: true
  # Three checkpoint triggers; the step-based one has no frequency set.
  checkpoint:
    every_n_minutes:
      save_frequency: 30
      num_models_saved: 3
    every_n_epochs:
      save_frequency: 1
      num_models_saved: 3
    every_n_train_steps:
      save_frequency: null
      num_models_saved: 0
  # All three logger backends are switched off in this configuration.
  log:
    wandb:
      enabled: false
    tensorboard:
      enabled: false
    mlflow:
      enabled: false
    interval: 100
  enable_progress_bar: true
  print_memory_summary: false
# Filesystem layout, file names and compute topology.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
hardware:
  paths:
    # Both roots are read from environment variables via the oc.env resolver.
    data: ${oc.decode:${oc.env:DATASETS_PATH}}
    output: ${oc.decode:${oc.env:OUTPUT_DIR}}
    # Every path below is derived from hardware.paths.output.
    logs:
      base: ${hardware.paths.output}/logs
      wandb: ${hardware.paths.output}/logs/wandb
      mlflow: ${hardware.paths.output}/logs/mlflow
      tensorboard: ${hardware.paths.output}/logs/tensorboard
    checkpoints: ${hardware.paths.output}/checkpoint/
    plots: ${hardware.paths.output}/plots/
    profiler: ${hardware.paths.output}/profiler/
    graph: ${hardware.paths.output}/graphs/
  files:
    dataset: aifs-od-an-oper-0001-mars-n320-2016-2023-6h-v6.zarr
    dataset_land: aifs-od-an-oper-0001-mars-n320-2016-2023-6h-v1-land.zarr
    graph: graph_enc_proc_dec_n320.pt
    # Filename templates; quoted because they contain `{` and `:`.
    checkpoint:
      every_n_epochs: 'aifs-by_epoch-epoch_{epoch:03d}-val_wmse_{val_wmse:.3e}'
      every_n_train_steps: 'aifs-by_step-epoch_{epoch:03d}-step_{step:06d}'
      every_n_minutes: 'aifs-by_time-epoch_{epoch:03d}-step_{step:06d}'
    warm_start: null
  accelerator: auto
  num_gpus_per_node: 4
  num_nodes: 16
  num_gpus_per_model: 4
# Graph definition: two node sets (`data`, `hidden`) and the edges that
# connect them in both directions.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
graph:
  overwrite: true
  data: data
  hidden: hidden
  nodes:
    data:
      # Data nodes are built from the dataset referenced by dataloader.dataset.
      node_builder:
        _target_: anemoi.graphs.nodes.ZarrDatasetNodes
        dataset: ${dataloader.dataset}
      attributes:
        area_weight:
          _target_: anemoi.graphs.nodes.attributes.AreaWeights
          norm: unit-max
    hidden:
      node_builder:
        _target_: anemoi.graphs.nodes.ReducedGaussianGridNodes
        grid: o96
  edges:
    # data -> hidden edges use the cut-off edge builder.
    - source_name: data
      target_name: hidden
      edge_builder:
        _target_: anemoi.graphs.edges.CutOffEdges
        cutoff_factor: 0.6
      attributes:
        edge_length:
          _target_: anemoi.graphs.edges.attributes.EdgeLength
          norm: unit-std
        edge_dirs:
          _target_: anemoi.graphs.edges.attributes.EdgeDirection
          norm: unit-std
    # hidden -> data edges use the nearest-neighbour edge builder.
    - source_name: hidden
      target_name: data
      edge_builder:
        _target_: anemoi.graphs.edges.KNNEdges
        num_nearest_neighbours: 3
      attributes:
        edge_length:
          _target_: anemoi.graphs.edges.attributes.EdgeLength
          norm: unit-std
        edge_dirs:
          _target_: anemoi.graphs.edges.attributes.EdgeDirection
          norm: unit-std
  # Same attribute definitions as the per-node and per-edge entries above.
  attributes:
    nodes:
      area_weight:
        _target_: anemoi.graphs.nodes.attributes.AreaWeights
        norm: unit-max
    edges:
      edge_length:
        _target_: anemoi.graphs.edges.attributes.EdgeLength
        norm: unit-std
      edge_dirs:
        _target_: anemoi.graphs.edges.attributes.EdgeDirection
        norm: unit-std
# Model architecture: encoder, processor, decoder and output bounding.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
model:
  activation: GELU
  num_channels: 1024
  model:
    _target_: anemoi.models.models.encoder_processor_decoder.AnemoiModelEncProcDec
  processor:
    _target_: anemoi.models.layers.processor.TransformerProcessor
    _convert_: all
    activation: GELU
    num_layers: 16
    num_chunks: 2
    mlp_hidden_ratio: 4
    num_heads: 16
    window_size: 1120
    dropout_p: 0.0
  encoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerForwardMapper
    _convert_: all
    trainable_size: 8
    sub_graph_edge_attributes: [edge_length, edge_dirs]
    activation: GELU
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
  decoder:
    _target_: anemoi.models.layers.mapper.GraphTransformerBackwardMapper
    _convert_: all
    trainable_size: 8
    sub_graph_edge_attributes: [edge_length, edge_dirs]
    activation: GELU
    num_chunks: 1
    mlp_hidden_ratio: 4
    num_heads: 16
  trainable_parameters:
    data: 8
    hidden: 8
    data2hidden: 8
    hidden2data: 8
  # Attribute names match those defined under graph.attributes.
  attributes:
    edges: [edge_length, edge_dirs]
    nodes: []
  node_loss_weight: area_weight
  # Output constraints, one list entry per bounding class.
  # NOTE(review): presumably applied to the listed output variables after
  # the decoder -- confirm.
  bounding:
    - _target_: anemoi.models.layers.bounding.ReluBounding
      variables:
        - tp
        - ro
        - tcw
        - ssrd
        - q_50
        - q_100
        - q_150
        - q_200
        - q_250
        - q_300
        - q_400
        - q_500
        - q_600
        - q_700
        - q_850
        - q_925
        - q_1000
    - _target_: anemoi.models.layers.bounding.HardtanhBounding
      variables: [tcc, swvl1, swvl2]
      min_val: 0
      max_val: 1
    # Each FractionBounding entry names a `total_var` for its variables.
    - _target_: anemoi.models.layers.bounding.FractionBounding
      variables: [cp, sf]
      min_val: 0
      max_val: 1
      total_var: tp
    - _target_: anemoi.models.layers.bounding.FractionBounding
      variables: [lcc, mcc, hcc]
      min_val: 0
      max_val: 1
      total_var: tcc
# Optimisation settings: run identity, precision, loss, rollout schedule,
# learning rate and per-variable loss weights.
# NOTE(review): the nesting below was reconstructed from a copy whose
# indentation had been stripped -- confirm against the upstream config.
training:
  run_id: null
  # The run to fork from is read from the PRETRAINING_RUN_ID env variable.
  fork_run_id: ${oc.decode:${oc.env:PRETRAINING_RUN_ID}}
  load_weights_only: true
  deterministic: false
  precision: 16-mixed
  multistep_input: 2
  accum_grad_batches: 1
  num_sanity_val_steps: 6
  gradient_clip:
    val: 32.0
    algorithm: value
  swa:
    enabled: false
    lr: 0.0001
  zero_optimizer: false
  training_loss:
    _target_: anemoi.training.losses.mse.WeightedMSELoss
    scalars:
      - variable
      - loss_weights_mask
    ignore_nans: false
  loss_gradient_scaling: false
  validation_metrics:
    - _target_: anemoi.training.losses.mse.WeightedMSELoss
      scalars: []
      ignore_nans: true
  rollout:
    start: 1
    epoch_increment: 1
    max: 12
  max_epochs: 13
  max_steps: 150000
  lr:
    rate: 8.0e-7
    iterations: 7900
    min: 3.0e-7
    warmup_t: 100
  # Loss weight per variable; `default` is presumably used for any variable
  # not listed -- TODO confirm.
  variable_loss_scaling:
    default: 1
    # NOTE(review): `pl` / `sfc` look like pressure-level and surface
    # variable groups -- confirm.
    pl:
      q: 0.6
      t: 6
      u: 0.8
      v: 0.5
      w: 0.001
      z: 12
    sfc:
      sp: 10
      10u: 0.5
      10v: 0.5
      100u: 0.1
      100v: 0.1
      2d: 0.5
      tp: 0.025
      cp: 0.0025
      ro: 0.005
      sf: 0.025
      tcc: 0.1
      mcc: 0.1
      lcc: 0.1
      hcc: 0.1
      swvl2: 200
      swvl1: 100
      stl2: 10
      stl1: 1
      ssrd: 0.05
      strd: 0.1
  metrics: [z_500, t_850, u_850, v_850]
  pressure_level_scaler:
    _target_: anemoi.training.data.scaling.ReluPressureLevelScaler
    minimum: 0.2
    slope: 0.001