Draft
Changes from all commits
46 commits
388a104
Adding the comm file from Makani and making the necessary changes to …
odiazib Oct 16, 2025
a72ef22
Implement a split of the dataset for spatial parallelism and create a…
odiazib Oct 16, 2025
a3e42cc
Adding the necessary files from Makani for spatial parallelism.
odiazib Oct 16, 2025
81e9e60
Adding the necessary files from Makani for spatial parallelism.
odiazib Oct 16, 2025
8f6e71a
Adding spatial parallelism to the model, layers, and FFT. Testing thi…
odiazib Oct 17, 2025
0ad9ffc
Adding NVIDIA PhysicsNemo
odiazib Oct 17, 2025
352f6fe
Move the block of code from conf to the xarray class initialization.
odiazib Oct 20, 2025
ac25394
Reintroduce the logic to run the case in serial and on the CPU.
odiazib Oct 21, 2025
30ffac8
Split domain for spatial parallelism.
odiazib Oct 22, 2025
3273809
Moving code to distributed class.
odiazib Oct 23, 2025
eac6d17
Fixing the xarray test with spatial parallelism.
odiazib Oct 23, 2025
1169228
Spatial distributed model test is working, but tolerance error is 1e-3.
odiazib Oct 28, 2025
ff60738
Adding init_gradient_reduction_hooks to the model and testing trainin…
odiazib Oct 29, 2025
aeb1d1e
Fixing the dataset reader to make training work for the E3SM case.
odiazib Oct 29, 2025
50f1bfb
Getting spatial parallelism input parameters from the CLI.
odiazib Nov 3, 2025
bbf615e
Saving and loading checkpoints when a model uses spatial parallelism.
odiazib Nov 3, 2025
4aa6cd6
Moving init_gradient_reduction_hooks to the Distributed class.
odiazib Nov 3, 2025
ee2b865
Fix initialization and checkpoint handling in distribute class
odiazib Nov 5, 2025
a8e8e31
Only use comm.cleanup() if spatial parallelism is enabled.
odiazib Nov 5, 2025
1402149
Removing old code.
odiazib Nov 5, 2025
8ea20c6
Adding recommendations for PR review and deleting old code.
odiazib Nov 6, 2025
d639dca
Adding recommendations for PR review
odiazib Nov 6, 2025
f48d1e6
Apply suggestion from @mcgibbon
odiazib Nov 6, 2025
3ae7bda
Removing 'comm' and moving the routine to the distribute class.
odiazib Nov 6, 2025
2033b03
Moving logic to distribute class.
odiazib Nov 6, 2025
1b82ca0
Removing 'comm' from the model implementation. This change makes the …
odiazib Nov 7, 2025
31e1835
Removing old code.
odiazib Nov 7, 2025
61425d3
Cleaning up code based on PR review.
odiazib Nov 7, 2025
4e1fc4a
Build the NeMo version of SFNO if spatial parallelism is on.
odiazib Nov 7, 2025
9b10095
Adding review recommendations.
odiazib Nov 7, 2025
44a601a
Fixing unit test for xarray sp
odiazib Nov 7, 2025
550572a
Fixing test for sfnonet with spatial parallelism.
odiazib Nov 8, 2025
07ed091
routine to create a directory.
odiazib Nov 10, 2025
d84b438
Saving test
odiazib Nov 10, 2025
8c8b5e6
unit test for loss function.
odiazib Nov 11, 2025
7e43b44
The ERA5 dataset uses latitude and longitude instead of lon and lat. …
odiazib Nov 17, 2025
2ef0082
Adding code back.
odiazib Nov 18, 2025
79869d4
Unit test for coordinates and annual aggregator. The spatial parallel…
odiazib Nov 19, 2025
7d43bbc
Fixing the error in batch size, we must use 'batch' and 'comm' to get…
odiazib Nov 19, 2025
44ba51f
Split data after it is read.
odiazib Nov 25, 2025
d714fbb
Using more than 1 sample.
odiazib Nov 25, 2025
ea4ee0d
Gather tensors in a snapshot so that the plots display the whole domain.
odiazib Nov 25, 2025
e0c7a7a
updates for unit tests.
odiazib Nov 25, 2025
664840b
tmp cleanup/modularization
mahf708 Nov 26, 2025
5257831
Cleaning up.
odiazib Dec 1, 2025
aad61d6
save unit test.
odiazib Dec 10, 2025
3 changes: 3 additions & 0 deletions fme/ace/aggregator/inference/enso/dynamic_index.py
@@ -48,6 +48,9 @@ def __post_init__(self):
torch.logical_and(lat_mask, lon_mask), 1.0, 0.0
)

dist = Distributed.get_instance()
self._regional_weights = self._regional_weights[*dist.get_local_slices(self._regional_weights.shape)]

@property
def regional_weights(self) -> torch.Tensor:
return self._regional_weights
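Note on the cropping above: it assumes Distributed.get_local_slices returns the (lat, lon) slices owned by the current spatial rank. The sketch below illustrates that assumed block decomposition for a 2x2 spatial layout; the rank-to-block mapping is hypothetical and stands in for the real Distributed implementation.

    import torch

    def local_slices_sketch(shape, h_rank, w_rank, h_size=2, w_size=2):
        # Assumed behavior: split the last two dims into contiguous blocks,
        # one block per (h_rank, w_rank) position in the spatial grid.
        lat, lon = shape[-2], shape[-1]
        h_step, w_step = lat // h_size, lon // w_size
        return (
            slice(h_rank * h_step, (h_rank + 1) * h_step),
            slice(w_rank * w_step, (w_rank + 1) * w_step),
        )

    weights = torch.ones(8, 16)
    sh, sw = local_slices_sketch(weights.shape, h_rank=1, w_rank=0)
    local_weights = weights[sh, sw]  # (4, 8) block held by this hypothetical rank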
12 changes: 12 additions & 0 deletions fme/ace/aggregator/inference/main.py
Expand Up @@ -35,6 +35,7 @@
from .time_mean import TimeMeanAggregator, TimeMeanEvaluatorAggregator
from .video import VideoAggregator
from .zonal_mean import ZonalMeanAggregator
from fme.core.distributed import Distributed

wandb = WandB.get_instance()
APPROXIMATELY_TWO_YEARS = datetime.timedelta(days=730)
@@ -157,12 +158,23 @@ def build(
monthly_reference_data = xr.open_dataset(
self.monthly_reference_data, decode_timedelta=False
)
dist = Distributed.get_instance()
# CHECK: Is there another way to get lat_length and lon_length?
# Should we move this splitting operation inside the InferenceEvaluatorAggregator?
lat_length = len(monthly_reference_data.coords['lat'])
lon_length = len(monthly_reference_data.coords['lon'])
crop_shape = (lat_length, lon_length)
slice_h, slice_w = dist.get_local_slices(crop_shape)
monthly_reference_data = monthly_reference_data.isel(lat=slice_h, lon=slice_w)

if self.time_mean_reference_data is None:
time_mean = None
else:
time_mean = xr.open_dataset(
self.time_mean_reference_data, decode_timedelta=False
)


return InferenceEvaluatorAggregator(
dataset_info=dataset_info,
n_timesteps=n_timesteps,
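On the CHECK above: one possible alternative, offered here only as an untested suggestion, is to read the lengths from the dataset's sizes mapping rather than calling len() on the coordinate variables:

    # Suggested variant of the block above; monthly_reference_data and dist
    # are the objects already defined in build().
    crop_shape = (
        monthly_reference_data.sizes["lat"],
        monthly_reference_data.sizes["lon"],
    )
    slice_h, slice_w = dist.get_local_slices(crop_shape)
    monthly_reference_data = monthly_reference_data.isel(lat=slice_h, lon=slice_w)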
21 changes: 10 additions & 11 deletions fme/ace/aggregator/one_step/snapshot.py
@@ -9,7 +9,7 @@

from ..plotting import plot_paneled_data


from fme.core.distributed import Distributed
class SnapshotAggregator:
"""
An aggregator that records the first sample of the last batch of data.
@@ -65,23 +65,22 @@ def _get_data(self) -> tuple[TensorMapping, TensorMapping, TensorMapping]:
input_time = 0
target_time = 1
gen, target, input = {}, {}, {}
+dist = Distributed.get_instance()
for name in self._gen_data.keys():
# use first sample in batch
-gen[name] = (
-self._gen_data[name]
-.select(dim=time_dim, index=target_time)[0]
-.cpu()
-.numpy()
-)
+gen_data_local=self._gen_data[name].select(dim=time_dim, index=target_time)[0]
+gen_data = dist.gather_spatial_distributed(gen_data_local)
+gen[name] = (gen_data.cpu().numpy())
+
+target_local=self._target_data[name].select(dim=time_dim, index=target_time)[0]
+target_data = dist.gather_spatial_distributed(target_local)
target[name] = (
-self._target_data[name]
-.select(dim=time_dim, index=target_time)[0]
+target_data
.cpu()
.numpy()
)
input[name] = (
-self._target_data[name]
-.select(dim=time_dim, index=input_time)[0]
+target_data
.cpu()
.numpy()
)
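The change above assumes gather_spatial_distributed reassembles the per-rank (lat, lon) shards into the full-domain tensor before plotting, and is presumably a no-op when spatial parallelism is disabled. A standalone sketch of that reassembly, using an explicit list of shards and slices instead of the real communicator:

    import torch

    def assemble_full_field(shards, slices, full_shape):
        # shards: one 2D tensor per spatial rank; slices: the matching
        # (slice_h, slice_w) pairs, e.g. from Distributed.get_local_slices.
        full = torch.empty(full_shape, dtype=shards[0].dtype)
        for shard, (sh, sw) in zip(shards, slices):
            full[sh, sw] = shard
        return full

    full_shape = (4, 4)
    slices = [(slice(0, 2), slice(0, 4)), (slice(2, 4), slice(0, 4))]
    shards = [torch.zeros(2, 4), torch.ones(2, 4)]
    snapshot = assemble_full_field(shards, slices, full_shape)  # full 4x4 field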
97 changes: 97 additions & 0 deletions fme/ace/aggregator/one_step/test_reduced_sp.py
@@ -0,0 +1,97 @@
import numpy as np
import pytest
import torch
import os
from fme.ace.aggregator.one_step.reduced import MeanAggregator
from fme.core.device import get_device
from fme.core.gridded_ops import LatLonOperations

from fme.core.distributed import Distributed


def test_loss_wo_sp():
"""
Basic test that the aggregator combines the loss correctly
across multiple batches without distributed training.
"""
nx=8
ny=8
torch.manual_seed(0)
example_data = {
"a": torch.randn(1, 2, nx, ny, device=get_device()),
}
area_weights = torch.ones(nx,ny).to(get_device())
aggregator = MeanAggregator(LatLonOperations(area_weights))
aggregator.record_batch(
loss=1.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
aggregator.record_batch(
loss=2.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
logs = aggregator.get_logs(label="metrics")
print("lost", logs["metrics/loss"] )
assert logs["metrics/loss"] == 1.5
aggregator.record_batch(
loss=3.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
logs = aggregator.get_logs(label="metrics")
print("lost", logs["metrics/loss"] )
assert logs["metrics/loss"] == 2.0

def test_loss_with_sp():
os.environ['H_PARALLEL_SIZE'] = '2'
os.environ['W_PARALLEL_SIZE'] = '2'
nx=8
ny=8
torch.manual_seed(0)
tensor_data_host=torch.randn(1, 2, nx, ny)
area_weights = torch.ones(nx,ny)
aggregator = MeanAggregator(LatLonOperations(area_weights))
dist = Distributed.get_instance()
this_shape=(tensor_data_host.shape[-2],tensor_data_host.shape[-1])
tensor_data_local_host = (tensor_data_host[:,:,*dist.get_local_slices(this_shape)]).detach().clone()
tensor_data_local=tensor_data_local_host.to(dist.local_rank)

example_data = {
"a": tensor_data_local
}

aggregator.record_batch(
loss=1.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
aggregator.record_batch(
loss=2.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
logs = aggregator.get_logs(label="metrics")
print("lost", logs["metrics/loss"] )
assert logs["metrics/loss"] == 1.5
aggregator.record_batch(
loss=3.0,
target_data=example_data,
gen_data=example_data,
target_data_norm=example_data,
gen_data_norm=example_data,
)
logs = aggregator.get_logs(label="metrics")
print("lost", logs["metrics/loss"] )
assert logs["metrics/loss"] == 2.0
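For reference, H_PARALLEL_SIZE=2 and W_PARALLEL_SIZE=2 in the test above imply four spatial ranks, each holding a 4x4 shard of the 8x8 grid (assuming an even block split), so the distributed branch only runs under a multi-process launcher such as torchrun. A rough sketch of the local-shape arithmetic:

    import os

    h_size = int(os.environ.get("H_PARALLEL_SIZE", "1"))  # 2 in the test above
    w_size = int(os.environ.get("W_PARALLEL_SIZE", "1"))  # 2 in the test above
    nx, ny = 8, 8
    # Assuming an even contiguous block decomposition, each spatial rank holds:
    local_shape = (nx // h_size, ny // w_size)  # (4, 4) for a 2x2 layout
    print(local_shape)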
6 changes: 1 addition & 5 deletions fme/ace/data_loading/config.py
@@ -82,11 +82,7 @@ def get_dataset(

def __post_init__(self):
dist = Distributed.get_instance()
-if self.batch_size % dist.world_size != 0:
-raise ValueError(
-"batch_size must be divisible by the number of parallel "
-f"workers, got {self.batch_size} and {dist.world_size}"
-)
+dist.check_local_batch_size(self.batch_size)
# TODO: remove following backwards compatibility code in a future release
if isinstance(self.dataset, Sequence):
warnings.warn(
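check_local_batch_size is assumed to replace the old world_size divisibility check with one against the data-parallel (batch) group only, since ranks that differ only in their spatial position see the same samples. A hypothetical sketch of that assumed check, not the actual Distributed method:

    def check_local_batch_size_sketch(batch_size: int, data_parallel_size: int) -> None:
        # Hypothetical: the global batch is divided across data-parallel ranks only;
        # spatial (H/W) ranks within a data-parallel group share the same batch.
        if batch_size % data_parallel_size != 0:
            raise ValueError(
                "batch_size must be divisible by the number of data-parallel "
                f"workers, got {batch_size} and {data_parallel_size}"
            )

    check_local_batch_size_sketch(batch_size=8, data_parallel_size=4)  # passes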
70 changes: 70 additions & 0 deletions fme/ace/models/makani_models/helpers.py
@@ -0,0 +1,70 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.distributed as dist

# from makani.utils import comm
from fme.ace.utils import comm


def count_parameters(model, device):
"""Counts model parameters"""

with torch.no_grad():
total_stats = torch.zeros(2, dtype=torch.long, device=device)
local_bytes = 0
for p in model.parameters():
if not p.requires_grad:
continue

# make sure complex weight tensors are accounted for correctly
pview = torch.view_as_real(p) if p.is_complex() else p
pstats = torch.tensor([pview.numel(), pview.nbytes], dtype=torch.long, device=device)
local_bytes += pview.nbytes

# if the weight is split, then we need to reduce
if hasattr(p, "sharded_dims_mp"):
for group in p.sharded_dims_mp:
if (group is not None) and (comm.get_size(group) > 1):
dist.all_reduce(pstats, group=comm.get_group(group))

# sum the total stats
total_stats += pstats

# transfer to cpu
total_stats_arr = total_stats.cpu().numpy()
total_count = total_stats_arr[0]
total_bytes = total_stats_arr[1]

return total_count, total_bytes, local_bytes


def compare_model_parameters(model1, model2):
"""Checks whether both models have the same parameters"""

for p1, p2 in zip(model1.parameters(), model2.parameters()):
if p1.data.ne(p2.data).any():
return False
return True


def check_parameters(model):
"""Prints shapes, strides and whether parameters are contiguous"""
for p in model.parameters():
if p.requires_grad:
print(p.shape, p.stride(), p.is_contiguous())

return
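A minimal usage sketch for count_parameters on a single, non-sharded process (the sharded_dims_mp reduction path is never triggered here); the import path follows the file added above:

    import torch
    from fme.ace.models.makani_models.helpers import count_parameters

    model = torch.nn.Linear(16, 4)  # 68 trainable parameters
    total_count, total_bytes, local_bytes = count_parameters(model, device="cpu")
    print(total_count, total_bytes, local_bytes)  # 68, 272, 272 for float32 weights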