from typing import List, Optional, Tuple, Union
import pandas as pd
try:
from mp_api.client import MPRester
except Exception as e:
print(e)
print("Falling back to from mp_api import MPRester")
from mp_api import MPRester
from mp_api.client.core.client import DEFAULT_API_KEY
from tqdm import tqdm
from typing_extensions import Literal
from matbench_genmetrics.mp_time_split.utils.data import (
get_discovery_dict,
noble,
radioactive,
)
# Keep these values in sync with the `Literal[...]` type hint used for the
# `exclude_elements` parameter of `fetch_data`.
AVAILABLE_EXCLUDE_STRS = [
    "noble",
    "radioactive",
    "noble+radioactive",
]
[docs]
def fetch_data(
    api_key: Optional[str] = DEFAULT_API_KEY,
    # NOTE: the list default is kept for interface compatibility; it is never
    # mutated because the body works on a copy of `fields`.
    fields: Optional[List[str]] = [  # noqa: B006
        "structure",
        "material_id",
        "theoretical",
        "energy_above_hull",
        "formation_energy_per_atom",
    ],
    num_sites: Optional[Tuple[int, int]] = None,
    elements: Optional[List[str]] = None,
    exclude_elements: Optional[
        Union[List[str], Literal["noble", "radioactive", "noble+radioactive"]]
    ] = None,
    use_theoretical: bool = False,
    return_both_if_experimental: bool = False,
    one_by_one: bool = False,
    **search_kwargs,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """Retrieve MP data sorted by MPID (theoretical+exptl) or pub year (exptl).

    See `*How do I do a time-split of Materials Project entries? e.g. pre-2018 vs.
    post-2018* <https://matsci.org/t/42584>`_

    Output ``DataFrame``-s will contain all specified `fields` unless ``fields is
    None``, in which case all :func:`MPRester().summary.available_fields` will be
    returned. If experimental data is returned, the additional columns
    ``references``, ``discovery`` and ``year`` are added to it: the references of
    each provenance document, a dictionary containing earliest year and author
    information, and the earliest year, respectively.

    Parameters
    ----------
    api_key : Optional[str]
        :func:`mp_api` API Key. On Windows, can set as an environment variable via:
        ``setx MP_API_KEY="abc123def456"``. By default:
        :func:`mp_api.client.core.client.DEFAULT_API_KEY`
        See also:
        https://github.com/materialsproject/api/issues/566#issuecomment-1087941474
    fields : Optional[List[str]]
        List of fields to project. When searching, it is better to only ask for the
        specific fields of interest to reduce the time taken to retrieve the
        documents. See the :func:`MPRester().summary.available_fields` property to
        see a list of fields to choose from. ``"material_id"`` is always requested,
        and ``"theoretical"`` is requested whenever `use_theoretical` is False; the
        list passed by the caller is not modified. By default:
        ``["structure", "material_id", "theoretical", "energy_above_hull",
        "formation_energy_per_atom"]``.
    num_sites : Tuple[int, int]
        Tuple of min and max number of sites used as filtering criteria, e.g. ``(1,
        52)`` meaning at least ``1`` and no more than ``52`` sites. If ``None`` then
        compounds with any number of sites are allowed. By default None.
    elements : List[str]
        List of element symbols, e.g. ``["Ni", "Fe"]``. If ``None`` then all elements
        are allowed. By default None.
    exclude_elements : Optional[
            Union[List[str], Literal["noble", "radioactive",
            "noble+radioactive"]]
        ]
        List of element symbols to _exclude_, e.g. ``["Ar", "Ne"]``. If ``None`` then
        all elements are allowed. If a supported string value ("noble", "radioactive",
        or "noble+radioactive"), then filters out the appropriate elements. By default
        None.
    use_theoretical : bool, optional
        Whether to include both theoretical and experimental compounds or to filter down
        to only experimentally-verified compounds, by default False
    return_both_if_experimental : bool, optional
        Whether to return both the experimental-only DataFrame (`expt_df`) and the
        full DataFrame containing theoretical+experimental (`df`) or only `expt_df`,
        by default False. This is only applicable if `use_theoretical` is False.
    one_by_one : bool, optional
        Whether to retrieve provenance data one-by-one instead of in bulk. This is
        useful for testing with a small number or in case the mp-api search is
        malfunctioning (since need provenance attributes). By default False.
    search_kwargs : dict, optional
        Supported search terms, e.g. nelements_max=3 for the "materials" search
        API. Consult the specific API route for valid search terms,
        i.e. :func:`MPRester().summary.available_fields`

    Returns
    -------
    df : pd.DataFrame
        if `use_theoretical` then returns a DataFrame containing both theoretical and
        experimental compounds, sorted by its integer MPID-based index.
    expt_df, df : Tuple[pd.DataFrame, pd.DataFrame]
        if not `use_theoretical` and `return_both_if_experimental`, then returns two
        :func:`pd.DataFrame`-s containing experimental-only (sorted by ``year``) and
        theoretical+experimental compounds, in that order.
    expt_df : pd.DataFrame
        if not `use_theoretical` and not `return_both_if_experimental`, then returns a
        :func:`pd.DataFrame` containing the experimental-only compounds.

    Raises
    ------
    NotImplementedError
        If `exclude_elements` is a string that is not one of
        ``AVAILABLE_EXCLUDE_STRS``.

    Examples
    --------
    >>> api_key = "abc123def456"
    >>> num_sites = (1, 52)
    >>> elements = ["V"]
    >>> expt_df = fetch_data(  # doctest: +SKIP
    ...     api_key, num_sites=num_sites, elements=elements
    ... )
    >>> df = fetch_data(  # doctest: +SKIP
    ...     api_key,
    ...     num_sites=num_sites,
    ...     elements=elements,
    ...     use_theoretical=True,
    ... )
    >>> expt_df, df = fetch_data(  # doctest: +SKIP
    ...     api_key,
    ...     num_sites=num_sites,
    ...     elements=elements,
    ...     use_theoretical=False,
    ...     return_both_if_experimental=True,
    ... )
    """
    if fields is not None:
        # Copy so that neither the caller's list nor the shared default list is
        # mutated by the `append` calls below.
        fields = list(fields)
        if "material_id" not in fields:
            # needed to build the index and to look up provenance
            fields.append("material_id")
        if not use_theoretical and "theoretical" not in fields:
            # needed to filter down to experimental entries
            fields.append("theoretical")

    excl_elems = _resolve_exclude_elements(exclude_elements)

    with MPRester(api_key) as mpr:
        results = mpr.summary.search(
            num_sites=num_sites,
            elements=elements,
            exclude_elements=excl_elems,
            fields=fields,
            **search_kwargs,
        )

        if fields is not None:
            field_data = [
                {field: getattr(r, field) for field in fields} for r in results
            ]
        else:
            # The documents are read via attribute access above, so they cannot be
            # assumed to be subscriptable. `dict(r)` yields a shallow field -> value
            # mapping for both plain mappings and pydantic models.
            # NOTE(review): assumes documents are mappings or pydantic models.
            field_data = [dict(r) for r in results]

        material_id = [str(fd["material_id"]) for fd in field_data]

        # mvc values get distinguished by a negative sign
        index = [
            int(mid.replace("mp-", "").replace("mvc-", "-")) for mid in material_id
        ]
        df = pd.DataFrame(field_data, index=index).sort_index()

        if not use_theoretical:
            # REVIEW: whether to use MPID class or str of MPIDs?
            # if latter, `expt_df.material_id.apply(str).tolist()`
            expt_df = df.query("theoretical == False")
            expt_material_id = expt_df.material_id.tolist()

            if not one_by_one:
                # Fetch provenance in bulk and select the experimental entries
                # locally, see https://github.com/materialsproject/api/issues/613
                provenance_results = mpr.materials.provenance.search(
                    fields=["references", "material_id"]
                )
                provenance_ids = [fpr.material_id for fpr in provenance_results]
                prov_df = pd.Series(
                    name="provenance", data=provenance_results, index=provenance_ids
                )
                expt_provenance_results = prov_df.loc[expt_material_id]
            else:
                # slow version: one request per experimental material
                expt_provenance_results = [
                    mpr.materials.provenance.get_data_by_id(mid)
                    for mid in tqdm(expt_material_id)
                ]

            # The provenance documents themselves are not stored on `expt_df`
            # because CrystalSystem is not JSON serializable, see
            # https://github.com/materialsproject/api/issues/615

            # extract earliest ICSD year
            references = [pr.references for pr in expt_provenance_results]
            discovery = get_discovery_dict(references)
            year = [disc["year"] for disc in discovery]

            # `assign` returns a new frame instead of writing into the slice that
            # `query` produced, https://stackoverflow.com/a/35387129/13697228
            expt_df = expt_df.assign(
                references=references, discovery=discovery, year=year
            )
            expt_df = expt_df.sort_values(by=["year"])

    if use_theoretical:
        return df
    if return_both_if_experimental:
        return expt_df, df
    return expt_df


def _resolve_exclude_elements(exclude_elements):
    """Translate `exclude_elements` into the value passed to the summary search.

    ``None`` and non-string values (i.e. explicit lists of element symbols) are
    returned unchanged; a supported preset string is replaced by the matching
    element list.

    Raises
    ------
    NotImplementedError
        If a string is given that is not one of ``AVAILABLE_EXCLUDE_STRS``.
    """
    if not isinstance(exclude_elements, str):
        return exclude_elements

    if exclude_elements not in AVAILABLE_EXCLUDE_STRS:
        raise NotImplementedError(
            "Because str passed to `exclude_elements` instead of list of str, "
            f"expected one of {AVAILABLE_EXCLUDE_STRS}"
        )

    presets = {
        "noble": noble,
        "radioactive": radioactive,
        "noble+radioactive": noble + radioactive,
    }
    return presets[exclude_elements]