Hi OCP Team!
While working through the notebook ocpmodels.dataset.lmdb_dataset_creation.ipynb, I noticed that the example you give builds its own bulk structure and runs MD on it to obtain the traj files, whereas a VASP relaxation calculation already produces many intermediate structures together with their energies and forces. I would therefore like to convert my own data into LMDB data for training (IS2RE/IS2RS). However, the LMDB dataset class SinglePointLmdbDataset has since been deprecated, so I tried to use the OC22LmdbDataset dataset instead, and ran into a problem: when I load the generated LMDB file with the OC22LmdbDataset class, the resulting dataset does not appear to contain any content that can be used for training.
My code is:
from ocpmodels.preprocessing import AtomsToGraphs
#from ocpmodels.datasets import SinglePointLmdbDataset
from ocpmodels.datasets.oc22_lmdb_dataset import OC22LmdbDataset
from ase.io import read
import lmdb
import pickle
import torch
# Load every ionic step recorded in the VASP OUTCAR and report how many
# frames were found. A raw string keeps the Windows backslashes literal
# instead of relying on unrecognised escape sequences such as "\d" or "\l",
# which newer Python versions warn about.
ase_data = read(r"D:\data\ldh\try\train\O\0\OUTCAR", index=":", format="vasp-out")
print(len(ase_data))

# Converter from ASE Atoms objects to graph Data objects, used by the
# conversion loop further down.
a2g = AtomsToGraphs(
    max_neigh=200,
    radius=6,
    r_energy=True,  # False for test data
    r_forces=True,  # False for test data
    r_distances=False,
    r_fixed=True,
)

# Output LMDB, stored as a single file (subdir=False) rather than a directory.
# NOTE(review): no map_size is passed, so py-lmdb's default size limit
# applies; set map_size explicitly before writing more than a few structures.
db = lmdb.open(
    r"D:\data\ldh\lmdbdata\train\CrMnFeCu.lmdb",
    subdir=False,
    meminit=False,
    map_async=True,
)
def read_trajectory_extract_features(a2g, traj_path):
    """Convert the first and last frame of a VASP run into graph objects.

    Args:
        a2g: Converter exposing ``convert_all(images, disable_tqdm=...)``.
        traj_path: Path to a file readable by ASE with ``format="vasp-out"``.

    Returns:
        The sequence returned by ``a2g.convert_all`` for
        ``[first_frame, last_frame]``; both entries get a ``tags`` attribute
        built from the tags of the first frame.
    """
    frames = read(traj_path, index=":", format="vasp-out")
    first_frame, last_frame = frames[0], frames[-1]
    # Tags are taken from the first frame only and reused for the last one.
    frame_tags = first_frame.get_tags()
    data_objects = a2g.convert_all(
        [first_frame, last_frame],
        disable_tqdm=True,
    )
    # Each graph gets its own tensor; the two do not share storage.
    for position in (0, 1):
        data_objects[position].tags = torch.LongTensor(frame_tags)
    return data_objects
# OUTCAR files to convert; each one becomes a single LMDB entry.
# Raw strings keep the Windows backslashes literal without depending on
# unrecognised escape sequences.
dir_path = [r"D:\data\ldh\try\train\O\0\OUTCAR"]
idx = 0

try:
    for system in dir_path:
        # Extract Data objects for the initial and the final frame.
        data_objects = read_trajectory_extract_features(a2g, system)
        initial_struc = data_objects[0]
        relaxed_struc = data_objects[1]

        # Fold the relaxed-state targets into the initial-structure object so
        # that one LMDB entry carries both ends of the relaxation.
        initial_struc.y_init = initial_struc.y  # subtract off reference energy, if applicable
        del initial_struc.y
        initial_struc.y_relaxed = relaxed_struc.y  # subtract off reference energy, if applicable
        initial_struc.pos_relaxed = relaxed_struc.pos

        # Filter data if necessary
        # OCP filters adsorption energies > |10| eV

        initial_struc.sid = idx  # arbitrary unique identifier

        # no neighbor edge case check
        if initial_struc.edge_index.shape[1] == 0:
            print("no neighbors", system)
            continue

        # Write to LMDB. The transaction context manager commits on a normal
        # exit and aborts if pickling or put() raises, so a failed write never
        # leaves a write transaction open.
        with db.begin(write=True) as txn:
            txn.put(f"{idx}".encode("ascii"), pickle.dumps(initial_struc, protocol=-1))
        db.sync()
        # Keys stay contiguous (0, 1, 2, ...) because idx only advances after
        # a successful write; skipped systems do not consume a key.
        idx += 1
finally:
    # Close the environment even when a conversion fails part-way through.
    db.close()
# Re-open the LMDB file written above and check that it loads.
# The path variable is named lmdb_path so it does not shadow the builtin
# dir(); the raw string keeps the Windows backslashes literal.
lmdb_path = r"D:\data\ldh\lmdbdata\train\CrMnFeCu.lmdb"
config = {
    "src": lmdb_path,
    # NOTE(review): the value "data2train" is kept exactly as posted; confirm
    # it is an option that OC22LmdbDataset actually recognises.
    "data2train": "data2train",
}
dataset = OC22LmdbDataset(config)
print(dataset)
# <ocpmodels.datasets.oc22_lmdb_dataset.OC22LmdbDataset object at 0x00000254F06919A0>
print(len(dataset))
# 1
With `dataset = OC22LmdbDataset(config)` I do get a dataset object, but it can't be run with the EquiformerV2 model.