From 0695ba28649c00ebd94ae3564d8eb6637446a990 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Fri, 13 Feb 2026 20:46:52 +0800 Subject: [PATCH 1/2] feat: support natom padding in deepmd/npy/mixed format --- dpdata/deepmd/mixed.py | 209 +++++++++++++++++++++++--- dpdata/plugins/deepmd.py | 26 +++- tests/test_deepmd_mixed.py | 300 +++++++++++++++++++++++++++++++++++++ 3 files changed, 511 insertions(+), 24 deletions(-) diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 38e8b386b..3be6fae1b 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -1,15 +1,151 @@ from __future__ import annotations import copy +import math import numpy as np import dpdata +from dpdata.data_type import Axis from .comp import dump as comp_dump from .comp import to_system_data as comp_to_system_data +def _pad_to(sys_data, target_natoms, dtypes): + """Pad system data dict so that NATOMS dimension becomes target_natoms. + + Virtual atoms get real_atom_types = -1, and all other per-atom data is + padded with zeros. + + Parameters + ---------- + sys_data : dict + System data dict, already in mixed-type format. + target_natoms : int + Target number of atoms after padding. + dtypes : tuple[DataType, ...] + Registered data types to iterate for generic per-atom padding. + """ + natoms = sys_data["atom_types"].shape[0] + npad = target_natoms - natoms + if npad <= 0: + return + nframes = sys_data["coords"].shape[0] + + # Pad atom_types (all MIXED_TOKEN = 0) + sys_data["atom_types"] = np.concatenate( + [sys_data["atom_types"], np.zeros(npad, dtype=int)] + ) + sys_data["atom_numbs"] = [target_natoms] + + # Pad real_atom_types with -1 (virtual atom sentinel) + sys_data["real_atom_types"] = np.concatenate( + [ + sys_data["real_atom_types"], + -np.ones((nframes, npad), dtype=sys_data["real_atom_types"].dtype), + ], + axis=1, + ) + + # Pad coords with zeros + sys_data["coords"] = np.concatenate( + [ + sys_data["coords"], + np.zeros((nframes, npad, 3), dtype=sys_data["coords"].dtype), + ], + axis=1, + ) + + # Pad all other per-atom data generically + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "orig", + "cells", + "coords", + "real_atom_names", + "real_atom_types", + "nopbc", + } + for dtype in dtypes: + if dtype.name in reserved: + continue + if dtype.name not in sys_data: + continue + if not ( + len(dtype.shape) >= 2 + and dtype.shape[0] == Axis.NFRAMES + and Axis.NATOMS in dtype.shape + ): + continue + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + arr = sys_data[dtype.name] + pad_width = [(0, 0)] * len(arr.shape) + pad_width[axis_natoms] = (0, npad) + sys_data[dtype.name] = np.pad( + arr, pad_width, mode="constant", constant_values=0 + ) + + +def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): + """Strip virtual atoms (type -1) from a group of frames. + + Parameters + ---------- + atom_types_row : np.ndarray + 1-D array of atom type indices for the group (same for all frames). + coords : np.ndarray + Coordinates array, shape (nframes, natoms_padded, 3). + extra_data : dict + Dict of {name: array} for this group, arrays already frame-sliced. + dtypes : tuple[DataType, ...] + Registered data types. + + Returns + ------- + real_mask : np.ndarray + Boolean mask of real atoms. + atom_types : np.ndarray + Atom types with virtual atoms removed. + coords : np.ndarray + Coords with virtual atoms removed. + extra_data : dict + Extra data with virtual atoms removed. + """ + real_mask = atom_types_row >= 0 + if real_mask.all(): + return atom_types_row, coords, extra_data + + atom_types = atom_types_row[real_mask] + coords = coords[:, real_mask, :] + + reserved = { + "atom_numbs", + "atom_names", + "atom_types", + "real_atom_names", + "real_atom_types", + "cells", + "coords", + "orig", + "nopbc", + } + stripped = {} + for name, arr in extra_data.items(): + for dtype in dtypes: + if dtype.name == name and Axis.NATOMS in dtype.shape: + axis_natoms = list(dtype.shape).index(Axis.NATOMS) + idx = [slice(None)] * len(arr.shape) + idx[axis_natoms] = real_mask + arr = arr[tuple(idx)] + break + stripped[name] = arr + + return atom_types, coords, stripped + + def to_system_data(folder, type_map=None, labels=True): data = comp_to_system_data(folder, type_map, labels) # data is empty @@ -26,7 +162,11 @@ def to_system_data(folder, type_map=None, labels=True): index_map = None all_real_atom_types_concat = data.pop("real_atom_types").astype(int) if index_map is not None: - all_real_atom_types_concat = index_map[all_real_atom_types_concat] + # Preserve -1 (virtual atom sentinel) during remapping + valid = all_real_atom_types_concat >= 0 + remapped = np.full_like(all_real_atom_types_concat, -1) + remapped[valid] = index_map[all_real_atom_types_concat[valid]] + all_real_atom_types_concat = remapped all_cells_concat = data["cells"] all_coords_concat = data["coords"] @@ -60,10 +200,6 @@ def to_system_data(folder, type_map=None, labels=True): while True: if all_real_atom_types_concat.size == 0: break - temp_atom_numbs = [ - np.count_nonzero(all_real_atom_types_concat[0] == i) - for i in range(len(data["atom_names"])) - ] # temp_formula = formula(data['atom_names'], temp_atom_numbs) temp_idx = np.arange(all_real_atom_types_concat.shape[0])[ (all_real_atom_types_concat == all_real_atom_types_concat[0]).all(-1) @@ -71,20 +207,37 @@ def to_system_data(folder, type_map=None, labels=True): rest_idx = np.arange(all_real_atom_types_concat.shape[0])[ (all_real_atom_types_concat != all_real_atom_types_concat[0]).any(-1) ] + + # Extract data for this group + group_atom_types = all_real_atom_types_concat[0] + group_coords = all_coords_concat[temp_idx] + group_extra = {} + for name in extra_data: + group_extra[name] = extra_data[name][temp_idx] + extra_data[name] = extra_data[name][rest_idx] + + # Strip virtual atoms (type -1) introduced by padding + group_atom_types, group_coords, group_extra = _strip_virtual_atoms( + group_atom_types, group_coords, group_extra, dtypes + ) + + temp_atom_numbs = [ + np.count_nonzero(group_atom_types == i) + for i in range(len(data["atom_names"])) + ] + temp_data = data.copy() temp_data["atom_names"] = data["atom_names"].copy() temp_data["atom_numbs"] = temp_atom_numbs - temp_data["atom_types"] = all_real_atom_types_concat[0] + temp_data["atom_types"] = group_atom_types all_real_atom_types_concat = all_real_atom_types_concat[rest_idx] temp_data["cells"] = all_cells_concat[temp_idx] all_cells_concat = all_cells_concat[rest_idx] - temp_data["coords"] = all_coords_concat[temp_idx] + temp_data["coords"] = group_coords all_coords_concat = all_coords_concat[rest_idx] - for name in extra_data: - all_dtype_concat = extra_data[name] - temp_data[name] = all_dtype_concat[temp_idx] - extra_data[name] = all_dtype_concat[rest_idx] + for name in group_extra: + temp_data[name] = group_extra[name] data_list.append(temp_data) return data_list @@ -109,7 +262,7 @@ def dump(folder, data, set_size=2000, comp_prec=np.float32, remove_sets=True): comp_dump(folder, data, set_size, comp_prec, remove_sets) -def mix_system(*system, type_map, **kwargs): +def mix_system(*system, type_map, atom_numb_pad=None, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. Parameters @@ -118,6 +271,11 @@ def mix_system(*system, type_map, **kwargs): The systems to mix type_map : list of str Maps atom type to name + atom_numb_pad : int, optional + If provided, pad atom counts to the next multiple of this number + using virtual atoms (type -1 in real_atom_types). This reduces the + number of subdirectories when systems have many different atom counts. + For example, atom_numb_pad=8 groups systems into multiples of 8. **kwargs : dict Other parameters @@ -129,21 +287,28 @@ def mix_system(*system, type_map, **kwargs): mixed_systems = {} temp_systems = {} atom_numbs_frame_index = {} # index of frames in cur sys + # Use LabeledSystem DTYPES as superset for generic per-atom padding + dtypes = dpdata.system.LabeledSystem.DTYPES for sys in system: tmp_sys = sys.copy() natom = tmp_sys.get_natoms() tmp_sys.convert_to_mixed_type(type_map=type_map) - if str(natom) not in atom_numbs_frame_index: - atom_numbs_frame_index[str(natom)] = 0 - atom_numbs_frame_index[str(natom)] += tmp_sys.get_nframes() - if str(natom) not in temp_systems or not temp_systems[str(natom)]: - temp_systems[str(natom)] = tmp_sys + if atom_numb_pad is not None and atom_numb_pad > 1: + padded_natom = math.ceil(natom / atom_numb_pad) * atom_numb_pad + _pad_to(tmp_sys.data, padded_natom, dtypes) + group_key = str(padded_natom) + else: + group_key = str(natom) + if group_key not in atom_numbs_frame_index: + atom_numbs_frame_index[group_key] = 0 + atom_numbs_frame_index[group_key] += tmp_sys.get_nframes() + if group_key not in temp_systems or not temp_systems[group_key]: + temp_systems[group_key] = tmp_sys else: - temp_systems[str(natom)].append(tmp_sys) - for natom in temp_systems: - if atom_numbs_frame_index[natom] > 0: - sys_name = f"{natom}" - mixed_systems[sys_name] = temp_systems[natom] + temp_systems[group_key].append(tmp_sys) + for natom_key in temp_systems: + if atom_numbs_frame_index[natom_key] > 0: + mixed_systems[natom_key] = temp_systems[natom_key] return mixed_systems diff --git a/dpdata/plugins/deepmd.py b/dpdata/plugins/deepmd.py index 2726e1d46..860f52d02 100644 --- a/dpdata/plugins/deepmd.py +++ b/dpdata/plugins/deepmd.py @@ -117,6 +117,12 @@ class DeePMDMixedFormat(Format): >>> import dpdata >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir") + Dump with ``atom_numb_pad`` to reduce the number of subdirectories. + Systems are padded with virtual atoms (type -1) so that atom counts are + rounded up to the nearest multiple of the given number: + + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) + Load a mixed type data into a MultiSystems: >>> import dpdata @@ -156,7 +162,7 @@ def from_labeled_system_mix(self, file_name, type_map=None, **kwargs): file_name, type_map=type_map, labels=True ) - def mix_system(self, *system, type_map, **kwargs): + def mix_system(self, *system, type_map, atom_numb_pad=None, **kwargs): """Mix the systems into mixed_type ones according to the unified given type_map. Parameters @@ -165,6 +171,13 @@ def mix_system(self, *system, type_map, **kwargs): The systems to mix type_map : list of str Maps atom type to name + atom_numb_pad : int, optional + If provided, pad atom counts to the next multiple of this number + using virtual atoms (type -1 in real_atom_types). This reduces the + number of subdirectories when systems have many different atom counts. + For example, ``atom_numb_pad=8`` groups systems into multiples of 8: + a 5-atom system is padded to 8, a 9-atom system is padded to 16, etc. + Virtual atoms are transparently removed when loading the data back. **kwargs : dict other parameters @@ -172,8 +185,17 @@ def mix_system(self, *system, type_map, **kwargs): ------- mixed_systems: dict dict of mixed system with key 'atom_numbs' + + Examples + -------- + Dump with padding so that atom counts are rounded up to multiples of 8: + + >>> import dpdata + >>> dpdata.MultiSystems(*systems).to_deepmd_npy_mixed("mixed_dir", atom_numb_pad=8) """ - return dpdata.deepmd.mixed.mix_system(*system, type_map=type_map, **kwargs) + return dpdata.deepmd.mixed.mix_system( + *system, type_map=type_map, atom_numb_pad=atom_numb_pad, **kwargs + ) def from_multi_systems(self, directory, **kwargs): register_spin() diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index bd8036876..10ed664d1 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -597,3 +597,303 @@ def test_aparam_exists(self): self.systems[formula].data["aparam"], decimal=self.places, ) + + +class TestMixedMultiSystemsPadding( + unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC +): + """Test round-trip with atom_numb_pad. + + C1H4 (5 atoms) and C1H3 (4 atoms) are both padded to 8 atoms, + so only 1 subfolder should be created. + """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad", atom_numb_pad=8) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad"): + shutil.rmtree("tmp.deepmd.mixed.pad") + + def test_single_subfolder(self): + """Both 4-atom and 5-atom systems padded to 8 -> 1 subfolder.""" + subdirs = [ + d + for d in os.listdir("tmp.deepmd.mixed.pad") + if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad", d)) + ] + self.assertEqual(len(subdirs), 1) + self.assertEqual(subdirs[0], "8") + + def test_real_atom_types_on_disk(self): + """Verify real_atom_types.npy contains -1 for virtual atoms.""" + mixed_sets = glob("tmp.deepmd.mixed.pad/*/set.*") + for s in mixed_sets: + rat = np.load(os.path.join(s, "real_atom_types.npy")) + # padded to 8, so last columns should be -1 + self.assertTrue(np.any(rat == -1)) + # first columns should be >= 0 + self.assertTrue(np.all(rat[:, 0] >= 0)) + + def test_loaded_natoms(self): + """Loaded systems should have original (unpadded) atom counts.""" + for formula, sys in self.systems.systems.items(): + if "H4" in formula: + self.assertEqual(sys.get_natoms(), 5) + elif "H3" in formula: + self.assertEqual(sys.get_natoms(), 4) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingMultipleGroups( + unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC +): + """Test padding with systems that span multiple padded groups. + + With atom_numb_pad=4: C1H3 (4 atoms) -> 4, C1H4 (5 atoms) -> 8. + Two subfolders should be created. + """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad2", atom_numb_pad=4) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad2", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad2"): + shutil.rmtree("tmp.deepmd.mixed.pad2") + + def test_two_subfolders(self): + """4-atom -> 4, 5-atom -> 8 => 2 subfolders.""" + subdirs = sorted( + d + for d in os.listdir("tmp.deepmd.mixed.pad2") + if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad2", d)) + ) + self.assertEqual(len(subdirs), 2) + self.assertIn("4", subdirs) + self.assertIn("8", subdirs) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingTypeMap( + unittest.TestCase, CompLabeledMultiSys, MSAllIsNoPBC +): + """Test padding + custom type_map on reload. + + This verifies the index_map bug fix for -1 values in real_atom_types. + """ + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + self.ms = dpdata.MultiSystems(system_1, system_2) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.tm", atom_numb_pad=8) + + new_type_map = ["H", "C"] + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad.tm", + fmt="deepmd/npy/mixed", + type_map=new_type_map, + ) + + # Apply same type_map to original for comparison + for kk in [ii.formula for ii in self.ms]: + self.ms[kk].apply_type_map(new_type_map) + tmp_ss = self.ms.systems.pop(kk) + self.ms.systems[tmp_ss.formula] = tmp_ss + + self.ms_1 = self.ms + self.ms_2 = self.systems + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad.tm"): + shutil.rmtree("tmp.deepmd.mixed.pad.tm") + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) + + +class TestMixedMultiSystemsPaddingAparam( + unittest.TestCase, CompLabeledMultiSys, MultiSystems, MSAllIsNoPBC +): + """Test padding with custom per-atom data (aparam).""" + + def setUp(self): + self.places = 6 + self.e_places = 6 + self.f_places = 6 + self.v_places = 6 + + new_datatypes = [ + DataType( + "fparam", + np.ndarray, + shape=(Axis.NFRAMES, 2), + required=False, + ), + DataType( + "aparam", + np.ndarray, + shape=(Axis.NFRAMES, Axis.NATOMS, 3), + required=False, + ), + ] + for datatype in new_datatypes: + dpdata.System.register_data_type(datatype) + dpdata.LabeledSystem.register_data_type(datatype) + + # C1H4 (5 atoms) + system_1 = dpdata.LabeledSystem( + "gaussian/methane.gaussianlog", fmt="gaussian/log" + ) + # C1H3 (4 atoms) + system_2 = dpdata.LabeledSystem( + "gaussian/methane_sub.gaussianlog", fmt="gaussian/log" + ) + + tmp_data_1 = system_1.data.copy() + nframes_1 = tmp_data_1["coords"].shape[0] + natoms_1 = tmp_data_1["atom_types"].shape[0] + tmp_data_1["fparam"] = np.random.random([nframes_1, 2]) + tmp_data_1["aparam"] = np.random.random([nframes_1, natoms_1, 3]) + system_1_with_params = dpdata.LabeledSystem(data=tmp_data_1) + + tmp_data_2 = system_2.data.copy() + nframes_2 = tmp_data_2["coords"].shape[0] + natoms_2 = tmp_data_2["atom_types"].shape[0] + tmp_data_2["fparam"] = np.random.random([nframes_2, 2]) + tmp_data_2["aparam"] = np.random.random([nframes_2, natoms_2, 3]) + system_2_with_params = dpdata.LabeledSystem(data=tmp_data_2) + + self.ms = dpdata.MultiSystems(system_1_with_params, system_2_with_params) + self.ms.to_deepmd_npy_mixed("tmp.deepmd.mixed.pad.ap", atom_numb_pad=8) + self.systems = dpdata.MultiSystems() + self.systems.from_deepmd_npy_mixed( + "tmp.deepmd.mixed.pad.ap", fmt="deepmd/npy/mixed" + ) + self.ms_1 = self.ms + self.ms_2 = self.systems + + self.system_names = ["C1H4", "C1H3"] + self.system_sizes = {"C1H4": 1, "C1H3": 1} + self.atom_names = ["C", "H"] + + def tearDown(self): + if os.path.exists("tmp.deepmd.mixed.pad.ap"): + shutil.rmtree("tmp.deepmd.mixed.pad.ap") + + def test_single_subfolder(self): + subdirs = [ + d + for d in os.listdir("tmp.deepmd.mixed.pad.ap") + if os.path.isdir(os.path.join("tmp.deepmd.mixed.pad.ap", d)) + ] + self.assertEqual(len(subdirs), 1) + + def test_fparam_preserved(self): + for formula in self.system_names: + if formula in self.ms.systems and formula in self.systems.systems: + np.testing.assert_almost_equal( + self.ms[formula].data["fparam"], + self.systems[formula].data["fparam"], + decimal=self.places, + ) + + def test_aparam_preserved(self): + """Per-atom aparam should be correctly padded and unpadded.""" + for formula in self.system_names: + if formula in self.ms.systems and formula in self.systems.systems: + np.testing.assert_almost_equal( + self.ms[formula].data["aparam"], + self.systems[formula].data["aparam"], + decimal=self.places, + ) + + def test_len(self): + self.assertEqual(len(self.ms), 2) + self.assertEqual(len(self.systems), 2) + + def test_get_nframes(self): + self.assertEqual(self.ms.get_nframes(), 2) + self.assertEqual(self.systems.get_nframes(), 2) From 281ec6a1eab2a5a1f729ad6e6ed47603554b8bf9 Mon Sep 17 00:00:00 2001 From: Han Wang Date: Sat, 14 Feb 2026 09:16:52 +0800 Subject: [PATCH 2/2] fix bugs --- dpdata/deepmd/mixed.py | 25 +------------------- tests/test_deepmd_mixed.py | 47 +++++++++++++++++++++++++++++++++----- 2 files changed, 42 insertions(+), 30 deletions(-) diff --git a/dpdata/deepmd/mixed.py b/dpdata/deepmd/mixed.py index 3be6fae1b..734b6a730 100644 --- a/dpdata/deepmd/mixed.py +++ b/dpdata/deepmd/mixed.py @@ -48,23 +48,13 @@ def _pad_to(sys_data, target_natoms, dtypes): axis=1, ) - # Pad coords with zeros - sys_data["coords"] = np.concatenate( - [ - sys_data["coords"], - np.zeros((nframes, npad, 3), dtype=sys_data["coords"].dtype), - ], - axis=1, - ) - - # Pad all other per-atom data generically + # Pad coords and all other per-atom data generically reserved = { "atom_numbs", "atom_names", "atom_types", "orig", "cells", - "coords", "real_atom_names", "real_atom_types", "nopbc", @@ -105,8 +95,6 @@ def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): Returns ------- - real_mask : np.ndarray - Boolean mask of real atoms. atom_types : np.ndarray Atom types with virtual atoms removed. coords : np.ndarray @@ -121,17 +109,6 @@ def _strip_virtual_atoms(atom_types_row, coords, extra_data, dtypes): atom_types = atom_types_row[real_mask] coords = coords[:, real_mask, :] - reserved = { - "atom_numbs", - "atom_names", - "atom_types", - "real_atom_names", - "real_atom_types", - "cells", - "coords", - "orig", - "nopbc", - } stripped = {} for name, arr in extra_data.items(): for dtype in dtypes: diff --git a/tests/test_deepmd_mixed.py b/tests/test_deepmd_mixed.py index 10ed664d1..d5b0dec64 100644 --- a/tests/test_deepmd_mixed.py +++ b/tests/test_deepmd_mixed.py @@ -650,15 +650,31 @@ def test_single_subfolder(self): self.assertEqual(len(subdirs), 1) self.assertEqual(subdirs[0], "8") - def test_real_atom_types_on_disk(self): - """Verify real_atom_types.npy contains -1 for virtual atoms.""" + def test_padded_virtual_atoms(self): + """Verify on-disk real atom count matches loaded natoms, and virtual + atoms have type -1 with zero coords and forces. + """ + loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()} mixed_sets = glob("tmp.deepmd.mixed.pad/*/set.*") + self.assertGreater(len(mixed_sets), 0) for s in mixed_sets: rat = np.load(os.path.join(s, "real_atom_types.npy")) - # padded to 8, so last columns should be -1 - self.assertTrue(np.any(rat == -1)) - # first columns should be >= 0 - self.assertTrue(np.all(rat[:, 0] >= 0)) + coord = np.load(os.path.join(s, "coord.npy")) + force = np.load(os.path.join(s, "force.npy")) + padded_natoms = rat.shape[1] + for ii in range(rat.shape[0]): + row = rat[ii] + n_real = int(np.sum(row >= 0)) + # on-disk real atom count must match one of the loaded systems + self.assertIn(n_real, loaded_natoms.values()) + # real atoms first, then virtual atoms + np.testing.assert_array_equal(row[:n_real] >= 0, True) + np.testing.assert_array_equal(row[n_real:], -1) + # virtual atom coords and forces must be zero + coord_frame = coord[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(coord_frame[n_real:], 0.0) + force_frame = force[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(force_frame[n_real:], 0.0) def test_loaded_natoms(self): """Loaded systems should have original (unpadded) atom counts.""" @@ -667,6 +683,8 @@ def test_loaded_natoms(self): self.assertEqual(sys.get_natoms(), 5) elif "H3" in formula: self.assertEqual(sys.get_natoms(), 4) + # no virtual atoms should remain in loaded data + self.assertTrue(np.all(sys.data["atom_types"] >= 0)) def test_len(self): self.assertEqual(len(self.ms), 2) @@ -890,6 +908,23 @@ def test_aparam_preserved(self): decimal=self.places, ) + def test_virtual_atoms_zero_on_disk(self): + """Verify virtual atoms have zero aparam on disk.""" + loaded_natoms = {f: s.get_natoms() for f, s in self.systems.systems.items()} + mixed_sets = glob("tmp.deepmd.mixed.pad.ap/*/set.*") + self.assertGreater(len(mixed_sets), 0) + for s in mixed_sets: + rat = np.load(os.path.join(s, "real_atom_types.npy")) + aparam = np.load(os.path.join(s, "aparam.npy")) + padded_natoms = rat.shape[1] + for ii in range(rat.shape[0]): + row = rat[ii] + n_real = int(np.sum(row >= 0)) + self.assertIn(n_real, loaded_natoms.values()) + # aparam shape on disk: (nframes, padded_natoms * 3) + aparam_frame = aparam[ii].reshape(padded_natoms, 3) + np.testing.assert_array_equal(aparam_frame[n_real:], 0.0) + def test_len(self): self.assertEqual(len(self.ms), 2) self.assertEqual(len(self.systems), 2)