From 0695ba28649c00ebd94ae3564d8eb6637446a990 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Fri, 13 Feb 2026 20:46:52 +0800
Subject: [PATCH 1/2] feat: support natom padding in deepmd/npy/mixed format

---
 dpdata/deepmd/mixed.py     | 209 +++++++++++++++++++++++---
 dpdata/plugins/deepmd.py   |  26 +++-
 tests/test_deepmd_mixed.py | 300 +++++++++++++++++++++++++++++++++++++
 3 files changed, 511 insertions(+), 24 deletions(-)

diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py
index 38e8b386b..3be6fae1b 100644
--- a/dpdata/deepmd/mixed.py
+++ b/dpdata/deepmd/mixed.py
@@ -1,15 +1,151 @@
 from __future__ import annotations
 
 import copy
+import math
 
 import numpy as np
 
 import dpdata
+from dpdata.data_type import Axis
 
 from .comp import dump as comp_dump
 from .comp import to_system_data as comp_to_system_data
 
 
+def _pad_to(sys_data, target_natoms, dtypes):
+    """Pad system data dict so that NATOMS dimension becomes target_natoms.
+
+    Modifies sys_data in place; a no-op when target_natoms <= current natoms.
+    Virtual atoms get real_atom_types = -1; other per-atom data is zero-padded.
+
+    Parameters
+    ----------
+    sys_data : dict
+        System data dict, already in mixed-type format.
+    target_natoms : int
+        Target number of atoms after padding.
+    dtypes : tuple[DataType, ...]
+        Registered data types to iterate for generic per-atom padding.
+    """
+    natoms = sys_data["atom_types"].shape[0]
+    npad = target_natoms - natoms
+    if npad <= 0:
+        return
+    nframes = sys_data["coords"].shape[0]
+
+    # Pad atom_types (all MIXED_TOKEN = 0)
+    sys_data["atom_types"] = np.concatenate(
+        [sys_data["atom_types"], np.zeros(npad, dtype=int)]
+    )
+    sys_data["atom_numbs"] = [target_natoms]
+
+    # Pad real_atom_types with -1 (virtual atom sentinel)
+    sys_data["real_atom_types"] = np.concatenate(
+        [
+            sys_data["real_atom_types"],
+            -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype),
+        ],
+        axis=1,
+    )
+
+    # Pad coords with zeros
+    sys_data["coords"] = np.concatenate(
+        [
+            sys_data["coords"],
+            np.zeros((nframes, npad, 3), dtype=sys_data["coords"].dtype),
+        ],
+        axis=1,
+    )
+
+    # Pad all other per-atom data generically
+    reserved = {
+        "atom_numbs",
+        "atom_names",
+        "atom_types",
+        "orig",
+        "cells",
+        "coords",
+        "real_atom_names",
+        "real_atom_types",
+        "nopbc",
+    }
+    for dtype in dtypes:
+        if dtype.name in reserved:
+            continue
+        if dtype.name not in sys_data:
+            continue
+        if not (
+            len(dtype.shape) >= 2
+            and dtype.shape[0] == Axis.NFRAMES
+            and Axis.NATOMS in dtype.shape
+        ):
+            continue
+        axis_natoms = list(dtype.shape).index(Axis.NATOMS)
+        arr = sys_data[dtype.name]
+        pad_width = [(0, 0)] * len(arr.shape)
+        pad_width[axis_natoms] = (0, npad)
+        sys_data[dtype.name] = np.pad(
+            arr, pad_width, mode="constant", constant_values=0
+        )
+
+
+def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes):
+    """Strip virtual atoms (type -1) from a group of frames.
+
+    Parameters
+    ----------
+    atom_types_row : np.ndarray
+        1-D array of atom type indices for the group (same for all frames).
+    coords : np.ndarray
+        Coordinates array, shape (nframes, natoms_padded, 3).
+    extra_data : dict
+        Dict of {name: array} for this group, arrays already frame-sliced.
+    dtypes : tuple[DataType, ...]
+        Registered data types.
+
+    Returns
+    -------
+    real_mask : np.ndarray
+        Boolean mask of real atoms.
+    atom_types : np.ndarray
+        Atom types with virtual atoms removed.
+    coords : np.ndarray
+        Coords with virtual atoms removed.
+    extra_data : dict
+        Extra data with virtual atoms removed.
+    """
+    real_mask = atom_types_row >= 0
+    if real_mask.all():
+        return atom_types_row, coords, extra_data
+
+    atom_types = atom_types_row[real_mask]
+    coords = coords[:, real_mask, :]
+
+    reserved = {
+        "atom_numbs",
+        "atom_names",
+        "atom_types",
+        "real_atom_names",
+        "real_atom_types",
+        "cells",
+        "coords",
+        "orig",
+        "nopbc",
+    }
+    stripped = {}
+    for name, arr in extra_data.items():
+        for dtype in dtypes:
+            if dtype.name == name and Axis.NATOMS in dtype.shape:
+                axis_natoms = list(dtype.shape).index(Axis.NATOMS)
+                idx = [slice(None)] * len(arr.shape)
+                idx[axis_natoms] = real_mask
+                arr = arr[tuple(idx)]
+                break
+        stripped[name] = arr
+
+    return atom_types, coords, stripped
+
+
 def to_system_data(folder, type_map=None, labels=True):
     data = comp_to_system_data(folder, type_map, labels)
     # data is empty
@@ -26,7 +162,11 @@ def to_system_data(folder, type_map=None, labels=True):
         index_map = None
     all_real_atom_types_concat = data.pop("real_atom_types").astype(int)
     if index_map is not None:
-        all_real_atom_types_concat = index_map[all_real_atom_types_concat]
+        # Preserve -1 (virtual atom sentinel) during remapping
+        valid = all_real_atom_types_concat >= 0
+        remapped = np.full_like(all_real_atom_types_concat, -1)
+        remapped[valid] = index_map[all_real_atom_types_concat[valid]]
+        all_real_atom_types_concat = remapped
     all_cells_concat = data["cells"]
     all_coords_concat = data["coords"]
 
@@ -60,10 +200,6 @@ def to_system_data(folder, type_map=None, labels=True):
     while True:
         if all_real_atom_types_concat.size == 0:
             break
-        temp_atom_numbs = [
-            np.count_nonzero(all_real_atom_types_concat[0] == i)
-            for i in range(len(data["atom_names"]))
-        ]
         # temp_formula = formula(data['atom_names'], temp_atom_numbs)
         temp_idx = np.arange(all_real_atom_types_concat.shape[0])[
             (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1)
@@ -71,20 +207,37 @@ def to_system_data(folder, type_map=None, labels=True):
         rest_idx = np.arange(all_real_atom_types_concat.shape[0])[
             (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1)
         ]
+
+        # Extract data for this group
+        group_atom_types = all_real_atom_types_concat[0]
+        group_coords = all_coords_concat[temp_idx]
+        group_extra = {}
+        for name in extra_data:
+            group_extra[name] = extra_data[name][temp_idx]
+            extra_data[name] = extra_data[name][rest_idx]
+
+        # Strip virtual atoms (type -1) introduced by padding
+        group_atom_types, group_coords, group_extra = _strip_virtual_atoms(
+            group_atom_types, group_coords, group_extra, dtypes
+        )
+
+        temp_atom_numbs = [
+            np.count_nonzero(group_atom_types == i)
+            for i in range(len(data["atom_names"]))
+        ]
+
         temp_data = data.copy()
         temp_data["atom_names"] = data["atom_names"].copy()
         temp_data["atom_numbs"] = temp_atom_numbs
-        temp_data["atom_types"] = all_real_atom_types_concat[0]
+        temp_data["atom_types"] = group_atom_types
         all_real_atom_types_concat = all_real_atom_types_concat[rest_idx]
         temp_data["cells"] = all_cells_concat[temp_idx]
         all_cells_concat = all_cells_concat[rest_idx]
-        temp_data["coords"] = all_coords_concat[temp_idx]
+        temp_data["coords"] = group_coords
         all_coords_concat = all_coords_concat[rest_idx]
 
-        for name in extra_data:
-            all_dtype_concat = extra_data[name]
-            temp_data[name] = all_dtype_concat[temp_idx]
-            extra_data[name] = all_dtype_concat[rest_idx]
+        for name in group_extra:
+            temp_data[name] = group_extra[name]
 
         data_list.append(temp_data)
     return data_list
@@ -109,7 +262,7 @@ def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True):
     comp_dump(folder, data, set_size, comp_prec, remove_sets)
 
 
-def mix_system(*system, type_map, **kwargs):
+def mix_system(*system, type_map, atom_numb_pad=None, **kwargs):
     """Mix the systems into mixed_type ones according to the unified given type_map.
 
     Parameters
@@ -118,6 +271,11 @@ def mix_system(*system, type_map, **kwargs):
         The systems to mix
     type_map : list of str
         Maps atom type to name
+    atom_numb_pad : int, optional
+        If provided and greater than 1, round each system's atom count up to
+        the nearest multiple of this number by appending virtual atoms (type -1
+        in real_atom_types). This reduces the number of subdirectories when
+        systems have many different atom counts, e.g. 8 pads 5 -> 8 and 9 -> 16.
     **kwargs : dict
         Other parameters
 
@@ -129,21 +287,28 @@ def mix_system(*system, type_map, **kwargs):
     mixed_systems = {}
     temp_systems = {}
     atom_numbs_frame_index = {}  # index of frames in cur sys
+    # Use LabeledSystem DTYPES as superset for generic per-atom padding
+    dtypes = dpdata.system.LabeledSystem.DTYPES
     for sys in system:
         tmp_sys = sys.copy()
         natom = tmp_sys.get_natoms()
         tmp_sys.convert_to_mixed_type(type_map=type_map)
-        if str(natom) not in atom_numbs_frame_index:
-            atom_numbs_frame_index[str(natom)] = 0
-        atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes()
-        if str(natom) not in temp_systems or not temp_systems[str(natom)]:
-            temp_systems[str(natom)] = tmp_sys
+        if atom_numb_pad is not None and atom_numb_pad > 1:
+            padded_natom = math.ceil(natom / atom_numb_pad) * atom_numb_pad
+            _pad_to(tmp_sys.data, padded_natom, dtypes)
+            group_key = str(padded_natom)
+        else:
+            group_key = str(natom)
+        if group_key not in atom_numbs_frame_index:
+            atom_numbs_frame_index[group_key] = 0
+        atom_numbs_frame_index[group_key] += tmp_sys.get_nframes()
+        if group_key not in temp_systems or not temp_systems[group_key]:
+            temp_systems[group_key] = tmp_sys
         else:
-            temp_systems[str(natom)].append(tmp_sys)
-    for natom in temp_systems:
-        if atom_numbs_frame_index[natom] > 0:
-            sys_name = f"{natom}"
-            mixed_systems[sys_name] = temp_systems[natom]
+            temp_systems[group_key].append(tmp_sys)
+    for natom_key in temp_systems:
+        if atom_numbs_frame_index[natom_key] > 0:
+            mixed_systems[natom_key] = temp_systems[natom_key]
     return mixed_systems
 
 
diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py
index 2726e1d46..860f52d02 100644
--- a/dpdata/plugins/deepmd.py
+++ b/dpdata/plugins/deepmd.py
@@ -117,6 +117,12 @@ class DeePMDMixedFormat(Format):
     >>> import dpdata
     >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir")
 
+    Dump with ``atom_numb_pad`` to reduce the number of subdirectories.
+    Systems are padded with virtual atoms (type -1) so that atom counts are
+    rounded up to the nearest multiple of the given number:
+
+    >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8)
+
     Load a mixed type data into a MultiSystems:
 
     >>> import dpdata
@@ -156,7 +162,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs):
             file_name, type_map=type_map, labels=True
         )
 
-    def mix_system(self, *system, type_map, **kwargs):
+    def mix_system(self, *system, type_map, atom_numb_pad=None, **kwargs):
         """Mix the systems into mixed_type ones according to the unified given type_map.
 
         Parameters
@@ -165,6 +171,13 @@ def mix_system(self, *system, type_map, **kwargs):
             The systems to mix
         type_map : list of str
             Maps atom type to name
+        atom_numb_pad : int, optional
+            If provided and greater than 1, round each system's atom count up to
+            the nearest multiple of this number by appending virtual atoms (type
+            -1 in real_atom_types). This reduces the number of subdirectories
+            when systems have many different atom counts. For example,
+            ``atom_numb_pad=8`` pads a 5-atom system to 8 and a 9-atom system to
+            16. Virtual atoms are transparently removed when loading the data back.
         **kwargs : dict
             other parameters
 
@@ -172,8 +185,17 @@ def mix_system(self, *system, type_map, **kwargs):
         -------
         mixed_systems: dict
             dict of mixed system with key 'atom_numbs'
+
+        Examples
+        --------
+        Dump with padding so that atom counts are rounded up to multiples of 8:
+
+        >>> import dpdata
+        >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8)
         """
-        return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs)
+        return dpdata.deepmd.mixed.mix_system(
+            *system, type_map=type_map, atom_numb_pad=atom_numb_pad, **kwargs
+        )
 
     def from_multi_systems(self, directory, **kwargs):
         register_spin()
diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py
index bd8036876..10ed664d1 100644
--- a/tests/test_deepmd_mixed.py
+++ b/tests/test_deepmd_mixed.py
@@ -597,3 +597,303 @@ def test_aparam_exists(self):
                     self.systems[formula].data["aparam"],
                     decimal=self.places,
                 )
+
+
+class TestMixedMultiSystemsPadding(
+    unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC
+):
+    """Test round-trip with atom_numb_pad.
+
+    C1H4 (5 atoms) and C1H3 (4 atoms) are both padded to 8 atoms,
+    so only 1 subfolder should be created.
+    """
+
+    def setUp(self):
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+        # C1H4 (5 atoms)
+        system_1 = dpdata.LabeledSystem(
+            "gaussian/methane.gaussianlog", fmt="gaussian/log"
+        )
+        # C1H3 (4 atoms)
+        system_2 = dpdata.LabeledSystem(
+            "gaussian/methane_sub.gaussianlog", fmt="gaussian/log"
+        )
+
+        self.ms = dpdata.MultiSystems(system_1, system_2)
+        self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad", atom_numb_pad=8)
+        self.systems = dpdata.MultiSystems()
+        self.systems.from_deepmd_npy_mixed(
+            "tmp.deepmd.mixed.pad", fmt="deepmd/npy/mixed"
+        )
+        self.ms_1 = self.ms
+        self.ms_2 = self.systems
+
+        self.system_names = ["C1H4", "C1H3"]
+        self.system_sizes = {"C1H4": 1, "C1H3": 1}
+        self.atom_names = ["C", "H"]
+
+    def tearDown(self):
+        if os.path.exists("tmp.deepmd.mixed.pad"):
+            shutil.rmtree("tmp.deepmd.mixed.pad")
+
+    def test_single_subfolder(self):
+        """Both 4-atom and 5-atom systems padded to 8 -> 1 subfolder."""
+        subdirs = [
+            d
+            for d in os.listdir("tmp.deepmd.mixed.pad")
+            if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad", d))
+        ]
+        self.assertEqual(len(subdirs), 1)
+        self.assertEqual(subdirs[0], "8")
+
+    def test_real_atom_types_on_disk(self):
+        """Verify real_atom_types.npy contains -1 for virtual atoms."""
+        mixed_sets = glob("tmp.deepmd.mixed.pad/*/set.*")
+        for s in mixed_sets:
+            rat = np.load(os.path.join(s, "real_atom_types.npy"))
+            # padded to 8, so last columns should be -1
+            self.assertTrue(np.any(rat == -1))
+            # first columns should be >= 0
+            self.assertTrue(np.all(rat[:, 0] >= 0))
+
+    def test_loaded_natoms(self):
+        """Loaded systems should have original (unpadded) atom counts."""
+        for formula, sys in self.systems.systems.items():
+            if "H4" in formula:
+                self.assertEqual(sys.get_natoms(), 5)
+            elif "H3" in formula:
+                self.assertEqual(sys.get_natoms(), 4)
+
+    def test_len(self):
+        self.assertEqual(len(self.ms), 2)
+        self.assertEqual(len(self.systems), 2)
+
+    def test_get_nframes(self):
+        self.assertEqual(self.ms.get_nframes(), 2)
+        self.assertEqual(self.systems.get_nframes(), 2)
+
+
+class TestMixedMultiSystemsPaddingMultipleGroups(
+    unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC
+):
+    """Test padding with systems that span multiple padded groups.
+
+    With atom_numb_pad=4: C1H3 (4 atoms) -> 4, C1H4 (5 atoms) -> 8.
+    Two subfolders should be created.
+    """
+
+    def setUp(self):
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+        # C1H4 (5 atoms)
+        system_1 = dpdata.LabeledSystem(
+            "gaussian/methane.gaussianlog", fmt="gaussian/log"
+        )
+        # C1H3 (4 atoms)
+        system_2 = dpdata.LabeledSystem(
+            "gaussian/methane_sub.gaussianlog", fmt="gaussian/log"
+        )
+
+        self.ms = dpdata.MultiSystems(system_1, system_2)
+        self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad2", atom_numb_pad=4)
+        self.systems = dpdata.MultiSystems()
+        self.systems.from_deepmd_npy_mixed(
+            "tmp.deepmd.mixed.pad2", fmt="deepmd/npy/mixed"
+        )
+        self.ms_1 = self.ms
+        self.ms_2 = self.systems
+
+        self.system_names = ["C1H4", "C1H3"]
+        self.system_sizes = {"C1H4": 1, "C1H3": 1}
+        self.atom_names = ["C", "H"]
+
+    def tearDown(self):
+        if os.path.exists("tmp.deepmd.mixed.pad2"):
+            shutil.rmtree("tmp.deepmd.mixed.pad2")
+
+    def test_two_subfolders(self):
+        """4-atom -> 4, 5-atom -> 8 => 2 subfolders."""
+        subdirs = sorted(
+            d
+            for d in os.listdir("tmp.deepmd.mixed.pad2")
+            if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad2", d))
+        )
+        self.assertEqual(len(subdirs), 2)
+        self.assertIn("4", subdirs)
+        self.assertIn("8", subdirs)
+
+    def test_len(self):
+        self.assertEqual(len(self.ms), 2)
+        self.assertEqual(len(self.systems), 2)
+
+    def test_get_nframes(self):
+        self.assertEqual(self.ms.get_nframes(), 2)
+        self.assertEqual(self.systems.get_nframes(), 2)
+
+
+class TestMixedMultiSystemsPaddingTypeMap(
+    unittest.TestCase, CompLabeledMultiSys, MSAllIsNoPBC
+):
+    """Test padding + custom type_map on reload.
+
+    This verifies the index_map bug fix for -1 values in real_atom_types.
+    """
+
+    def setUp(self):
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+        # C1H4 (5 atoms)
+        system_1 = dpdata.LabeledSystem(
+            "gaussian/methane.gaussianlog", fmt="gaussian/log"
+        )
+        # C1H3 (4 atoms)
+        system_2 = dpdata.LabeledSystem(
+            "gaussian/methane_sub.gaussianlog", fmt="gaussian/log"
+        )
+
+        self.ms = dpdata.MultiSystems(system_1, system_2)
+        self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.tm", atom_numb_pad=8)
+
+        new_type_map = ["H", "C"]
+        self.systems = dpdata.MultiSystems()
+        self.systems.from_deepmd_npy_mixed(
+            "tmp.deepmd.mixed.pad.tm",
+            fmt="deepmd/npy/mixed",
+            type_map=new_type_map,
+        )
+
+        # Apply same type_map to original for comparison
+        for kk in [ii.formula for ii in self.ms]:
+            self.ms[kk].apply_type_map(new_type_map)
+            tmp_ss = self.ms.systems.pop(kk)
+            self.ms.systems[tmp_ss.formula] = tmp_ss
+
+        self.ms_1 = self.ms
+        self.ms_2 = self.systems
+
+    def tearDown(self):
+        if os.path.exists("tmp.deepmd.mixed.pad.tm"):
+            shutil.rmtree("tmp.deepmd.mixed.pad.tm")
+
+    def test_len(self):
+        self.assertEqual(len(self.ms), 2)
+        self.assertEqual(len(self.systems), 2)
+
+    def test_get_nframes(self):
+        self.assertEqual(self.ms.get_nframes(), 2)
+        self.assertEqual(self.systems.get_nframes(), 2)
+
+
+class TestMixedMultiSystemsPaddingAparam(
+    unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC
+):
+    """Test padding with custom per-atom data (aparam)."""
+
+    def setUp(self):
+        self.places = 6
+        self.e_places = 6
+        self.f_places = 6
+        self.v_places = 6
+
+        new_datatypes = [
+            DataType(
+                "fparam",
+                np.ndarray,
+                shape=(Axis.NFRAMES, 2),
+                required=False,
+            ),
+            DataType(
+                "aparam",
+                np.ndarray,
+                shape=(Axis.NFRAMES, Axis.NATOMS, 3),
+                required=False,
+            ),
+        ]
+        for datatype in new_datatypes:
+            dpdata.System.register_data_type(datatype)
+            dpdata.LabeledSystem.register_data_type(datatype)
+
+        # C1H4 (5 atoms)
+        system_1 = dpdata.LabeledSystem(
+            "gaussian/methane.gaussianlog", fmt="gaussian/log"
+        )
+        # C1H3 (4 atoms)
+        system_2 = dpdata.LabeledSystem(
+            "gaussian/methane_sub.gaussianlog", fmt="gaussian/log"
+        )
+
+        tmp_data_1 = system_1.data.copy()
+        nframes_1 = tmp_data_1["coords"].shape[0]
+        natoms_1 = tmp_data_1["atom_types"].shape[0]
+        tmp_data_1["fparam"] = np.random.random([nframes_1, 2])
+        tmp_data_1["aparam"] = np.random.random([nframes_1, natoms_1, 3])
+        system_1_with_params = dpdata.LabeledSystem(data=tmp_data_1)
+
+        tmp_data_2 = system_2.data.copy()
+        nframes_2 = tmp_data_2["coords"].shape[0]
+        natoms_2 = tmp_data_2["atom_types"].shape[0]
+        tmp_data_2["fparam"] = np.random.random([nframes_2, 2])
+        tmp_data_2["aparam"] = np.random.random([nframes_2, natoms_2, 3])
+        system_2_with_params = dpdata.LabeledSystem(data=tmp_data_2)
+
+        self.ms = dpdata.MultiSystems(system_1_with_params, system_2_with_params)
+        self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.ap", atom_numb_pad=8)
+        self.systems = dpdata.MultiSystems()
+        self.systems.from_deepmd_npy_mixed(
+            "tmp.deepmd.mixed.pad.ap", fmt="deepmd/npy/mixed"
+        )
+        self.ms_1 = self.ms
+        self.ms_2 = self.systems
+
+        self.system_names = ["C1H4", "C1H3"]
+        self.system_sizes = {"C1H4": 1, "C1H3": 1}
+        self.atom_names = ["C", "H"]
+
+    def tearDown(self):
+        if os.path.exists("tmp.deepmd.mixed.pad.ap"):
+            shutil.rmtree("tmp.deepmd.mixed.pad.ap")
+
+    def test_single_subfolder(self):
+        subdirs = [
+            d
+            for d in os.listdir("tmp.deepmd.mixed.pad.ap")
+            if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad.ap", d))
+        ]
+        self.assertEqual(len(subdirs), 1)
+
+    def test_fparam_preserved(self):
+        for formula in self.system_names:
+            if formula in self.ms.systems and formula in self.systems.systems:
+                np.testing.assert_almost_equal(
+                    self.ms[formula].data["fparam"],
+                    self.systems[formula].data["fparam"],
+                    decimal=self.places,
+                )
+
+    def test_aparam_preserved(self):
+        """Per-atom aparam should be correctly padded and unpadded."""
+        for formula in self.system_names:
+            if formula in self.ms.systems and formula in self.systems.systems:
+                np.testing.assert_almost_equal(
+                    self.ms[formula].data["aparam"],
+                    self.systems[formula].data["aparam"],
+                    decimal=self.places,
+                )
+
+    def test_len(self):
+        self.assertEqual(len(self.ms), 2)
+        self.assertEqual(len(self.systems), 2)
+
+    def test_get_nframes(self):
+        self.assertEqual(self.ms.get_nframes(), 2)
+        self.assertEqual(self.systems.get_nframes(), 2)

From 281ec6a1eab2a5a1f729ad6e6ed47603554b8bf9 Mon Sep 17 00:00:00 2001
From: Han Wang <wang_han@iapcm.ac.cn>
Date: Sat, 14 Feb 2026 09:16:52 +0800
Subject: [PATCH 2/2] fix: pad coords via generic per-atom path, remove unused code, strengthen padding tests

---
 dpdata/deepmd/mixed.py     | 25 +-------------------
 tests/test_deepmd_mixed.py | 47 +++++++++++++++++++++++++++++++++-----
 2 files changed, 42 insertions(+), 30 deletions(-)

diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py
index 3be6fae1b..734b6a730 100644
--- a/dpdata/deepmd/mixed.py
+++ b/dpdata/deepmd/mixed.py
@@ -48,23 +48,13 @@ def _pad_to(sys_data, target_natoms, dtypes):
         axis=1,
     )
 
-    # Pad coords with zeros
-    sys_data["coords"] = np.concatenate(
-        [
-            sys_data["coords"],
-            np.zeros((nframes, npad, 3), dtype=sys_data["coords"].dtype),
-        ],
-        axis=1,
-    )
-
-    # Pad all other per-atom data generically
+    # Pad coords and all other per-atom data generically
     reserved = {
         "atom_numbs",
         "atom_names",
         "atom_types",
         "orig",
         "cells",
-        "coords",
         "real_atom_names",
         "real_atom_types",
         "nopbc",
@@ -105,8 +95,6 @@ def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes):
 
     Returns
     -------
-    real_mask : np.ndarray
-        Boolean mask of real atoms.
     atom_types : np.ndarray
         Atom types with virtual atoms removed.
     coords : np.ndarray
@@ -121,17 +109,6 @@ def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes):
     atom_types = atom_types_row[real_mask]
     coords = coords[:, real_mask, :]
 
-    reserved = {
-        "atom_numbs",
-        "atom_names",
-        "atom_types",
-        "real_atom_names",
-        "real_atom_types",
-        "cells",
-        "coords",
-        "orig",
-        "nopbc",
-    }
     stripped = {}
     for name, arr in extra_data.items():
         for dtype in dtypes:
diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py
index 10ed664d1..d5b0dec64 100644
--- a/tests/test_deepmd_mixed.py
+++ b/tests/test_deepmd_mixed.py
@@ -650,15 +650,31 @@ def test_single_subfolder(self):
         self.assertEqual(len(subdirs), 1)
         self.assertEqual(subdirs[0], "8")
 
-    def test_real_atom_types_on_disk(self):
-        """Verify real_atom_types.npy contains -1 for virtual atoms."""
+    def test_padded_virtual_atoms(self):
+        """Verify on-disk real atom count matches loaded natoms, and virtual
+        atoms have type -1 with zero coords and forces.
+        """
+        loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()}
         mixed_sets = glob("tmp.deepmd.mixed.pad/*/set.*")
+        self.assertGreater(len(mixed_sets), 0)
         for s in mixed_sets:
             rat = np.load(os.path.join(s, "real_atom_types.npy"))
-            # padded to 8, so last columns should be -1
-            self.assertTrue(np.any(rat == -1))
-            # first columns should be >= 0
-            self.assertTrue(np.all(rat[:, 0] >= 0))
+            coord = np.load(os.path.join(s, "coord.npy"))
+            force = np.load(os.path.join(s, "force.npy"))
+            padded_natoms = rat.shape[1]
+            for ii in range(rat.shape[0]):
+                row = rat[ii]
+                n_real = int(np.sum(row >= 0))
+                # on-disk real atom count must match one of the loaded systems
+                self.assertIn(n_real, loaded_natoms.values())
+                # real atoms first, then virtual atoms
+                np.testing.assert_array_equal(row[:n_real] >= 0, True)
+                np.testing.assert_array_equal(row[n_real:], -1)
+                # virtual atom coords and forces must be zero
+                coord_frame = coord[ii].reshape(padded_natoms, 3)
+                np.testing.assert_array_equal(coord_frame[n_real:], 0.0)
+                force_frame = force[ii].reshape(padded_natoms, 3)
+                np.testing.assert_array_equal(force_frame[n_real:], 0.0)
 
     def test_loaded_natoms(self):
         """Loaded systems should have original (unpadded) atom counts."""
@@ -667,6 +683,8 @@ def test_loaded_natoms(self):
                 self.assertEqual(sys.get_natoms(), 5)
             elif "H3" in formula:
                 self.assertEqual(sys.get_natoms(), 4)
+            # no virtual atoms should remain in loaded data
+            self.assertTrue(np.all(sys.data["atom_types"] >= 0))
 
     def test_len(self):
         self.assertEqual(len(self.ms), 2)
@@ -890,6 +908,23 @@ def test_aparam_preserved(self):
                     decimal=self.places,
                 )
 
+    def test_virtual_atoms_zero_on_disk(self):
+        """Verify virtual atoms have zero aparam on disk."""
+        loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()}
+        mixed_sets = glob("tmp.deepmd.mixed.pad.ap/*/set.*")
+        self.assertGreater(len(mixed_sets), 0)
+        for s in mixed_sets:
+            rat = np.load(os.path.join(s, "real_atom_types.npy"))
+            aparam = np.load(os.path.join(s, "aparam.npy"))
+            padded_natoms = rat.shape[1]
+            for ii in range(rat.shape[0]):
+                row = rat[ii]
+                n_real = int(np.sum(row >= 0))
+                self.assertIn(n_real, loaded_natoms.values())
+                # aparam shape on disk: (nframes, padded_natoms * 3)
+                aparam_frame = aparam[ii].reshape(padded_natoms, 3)
+                np.testing.assert_array_equal(aparam_frame[n_real:], 0.0)
+
     def test_len(self):
         self.assertEqual(len(self.ms), 2)
         self.assertEqual(len(self.systems), 2)