Source code for matbench_genmetrics.mp_time_split.utils.api

from typing import List, Optional, Tuple, Union

import pandas as pd

try:
    from mp_api.client import MPRester
except Exception as e:
    print(e)
    print("Falling back to from mp_api import MPRester")
    from mp_api import MPRester


from mp_api.client.core.client import DEFAULT_API_KEY
from tqdm import tqdm
from typing_extensions import Literal

from matbench_genmetrics.mp_time_split.utils.data import (
    get_discovery_dict,
    noble,
    radioactive,
)

# Preset names accepted as a plain string by `fetch_data(exclude_elements=...)`.
# Keep this list in sync with the `Literal[...]` type hint on that parameter.
AVAILABLE_EXCLUDE_STRS = [
    "noble",
    "radioactive",
    "noble+radioactive",
]


def _resolve_exclude_elements(exclude_elements):
    """Translate the `exclude_elements` argument into a list of symbols (or None).

    ``None`` and non-string values (i.e. an explicit list of element symbols)
    are passed through unchanged. A string must be one of
    ``AVAILABLE_EXCLUDE_STRS`` and is mapped to the corresponding element list.

    Raises
    ------
    NotImplementedError
        If a string is passed that is not a supported preset name.
    """
    if exclude_elements is None:
        return None
    if not isinstance(exclude_elements, str):
        return exclude_elements

    presets = {
        "noble": noble,
        "radioactive": radioactive,
        "noble+radioactive": noble + radioactive,
    }
    # Checking both containers means a preset added to AVAILABLE_EXCLUDE_STRS
    # but not handled here fails loudly instead of leaving a name unbound.
    if exclude_elements not in AVAILABLE_EXCLUDE_STRS or exclude_elements not in presets:
        raise NotImplementedError(
            f"Because str passed to `exclude_elements` instead of list of str, expected one of {AVAILABLE_EXCLUDE_STRS}"  # noqa: E501
        )
    return presets[exclude_elements]


def fetch_data(
    api_key: Optional[str] = DEFAULT_API_KEY,
    fields: Optional[List[str]] = [
        "structure",
        "material_id",
        "theoretical",
        "energy_above_hull",
        "formation_energy_per_atom",
    ],
    num_sites: Optional[Tuple[int, int]] = None,
    elements: Optional[List[str]] = None,
    exclude_elements: Optional[
        Union[List[str], Literal["noble", "radioactive", "noble+radioactive"]]
    ] = None,
    use_theoretical: bool = False,
    return_both_if_experimental: bool = False,
    one_by_one: bool = False,
    **search_kwargs,
) -> Union[pd.DataFrame, Tuple[pd.DataFrame, pd.DataFrame]]:
    """Retrieve MP data sorted by MPID (theoretical+exptl) or pub year (exptl).

    See `*How do I do a time-split of Materials Project entries? e.g. pre-2018
    vs. post-2018* <https://matsci.org/t/42584>`_

    Output ``DataFrame``-s will contain all specified `fields` unless
    ``fields is None``, in which case no field projection is requested from
    the summary search. When experimental data is returned, the additional
    columns ``references``, ``discovery`` and ``year`` are attached: the
    ``references`` attribute of each provenance result, the output of
    ``get_discovery_dict(references)``, and the ``"year"`` entry of each
    discovery dictionary, respectively.

    Parameters
    ----------
    api_key : Optional[str]
        :func:`mp_api` API Key. On Windows, can set as an environment variable
        via: ``setx MP_API_KEY="abc123def456"``. By default:
        :func:`mp_api.core.client.DEFAULT_API_KEY`
        See also:
        https://github.com/materialsproject/api/issues/566#issuecomment-1087941474
    fields : Optional[List[str]]
        List of fields to project. When searching, it is better to only ask for
        the specific fields of interest to reduce the time taken to retrieve
        the documents. See the :func:`MPRester().summary.available_fields`
        property to see a list of fields to choose from. ``"material_id"`` is
        always added, and ``"theoretical"`` is added when `use_theoretical` is
        False; the list passed in is not modified. By default:
        ``["structure", "material_id", "theoretical", "energy_above_hull",
        "formation_energy_per_atom"]``.
    num_sites : Optional[Tuple[int, int]]
        Tuple of min and max number of sites used as filtering criteria, e.g.
        ``(1, 52)`` meaning at least ``1`` and no more than ``52`` sites. If
        ``None`` then compounds with any number of sites are allowed. By
        default None.
    elements : Optional[List[str]]
        List of element symbols, e.g. ``["Ni", "Fe"]``. If ``None`` then all
        elements are allowed. By default None.
    exclude_elements : Optional[Union[List[str], Literal["noble", "radioactive", "noble+radioactive"]]]
        List of element symbols to *exclude*, e.g. ``["Ar", "Ne"]``. If
        ``None`` then all elements are allowed. If a supported string value
        ("noble", "radioactive", or "noble+radioactive"), then filters out the
        appropriate elements. By default None.
    use_theoretical : bool, optional
        Whether to include both theoretical and experimental compounds or to
        filter down to only experimentally-verified compounds, by default
        False.
    return_both_if_experimental : bool, optional
        Whether to return both the experimental-only DataFrame (`expt_df`) and
        the full DataFrame containing theoretical+experimental (`df`), or only
        `expt_df`, by default False. This is only applicable if
        `use_theoretical` is False.
    one_by_one : bool, optional
        Whether to retrieve provenance data one-by-one instead of in bulk. This
        is useful for testing with a small number or in case the mp-api search
        is malfunctioning (since need provenance attributes). By default False.
    search_kwargs : dict, optional
        Supported search terms, e.g. ``nelements_max=3``, forwarded to
        ``MPRester().summary.search``. Consult the specific API route for valid
        search terms.

    Returns
    -------
    df : pd.DataFrame
        if `use_theoretical` then returns a DataFrame containing both
        theoretical and experimental compounds, sorted by MPID-derived index.
    expt_df, df : Tuple[pd.DataFrame, pd.DataFrame]
        if not `use_theoretical` and `return_both_if_experimental`, then
        returns two :func:`pd.DataFrame`-s containing experimental-only and
        theoretical+experimental compounds, in that order.
    expt_df : pd.DataFrame
        if not `use_theoretical` and not `return_both_if_experimental`, then
        returns a :func:`pd.DataFrame` containing the experimental-only
        compounds, sorted by ``year``.

    Raises
    ------
    NotImplementedError
        If `exclude_elements` is a string that is not one of
        ``AVAILABLE_EXCLUDE_STRS``.

    Examples
    --------
    >>> api_key = "abc123def456"
    >>> num_sites = (1, 52)
    >>> elements = ["V"]
    >>> expt_df = fetch_data(api_key, num_sites=num_sites, elements=elements)
    >>> df = fetch_data(
    ...     api_key, num_sites=num_sites, elements=elements, use_theoretical=True
    ... )
    >>> expt_df, df = fetch_data(
    ...     api_key,
    ...     num_sites=num_sites,
    ...     elements=elements,
    ...     use_theoretical=False,
    ...     return_both_if_experimental=True,
    ... )
    """
    if fields is not None:
        # Work on a copy so that neither the caller's list nor the shared
        # default list object is mutated by the appends below.
        fields = list(fields)
        if "material_id" not in fields:
            fields.append("material_id")
        if not use_theoretical and "theoretical" not in fields:
            fields.append("theoretical")

    excl_elems = _resolve_exclude_elements(exclude_elements)

    with MPRester(api_key) as mpr:
        results = mpr.summary.search(
            num_sites=num_sites,
            elements=elements,
            exclude_elements=excl_elems,
            fields=fields,
            **search_kwargs,
        )

        if fields is not None:
            field_data = [
                {field: getattr(r, field) for field in fields} for r in results
            ]
        else:
            # NOTE(review): this path subscripts the raw search results with
            # ["material_id"] below; confirm the result objects support that.
            field_data = results

        material_id = [str(fd["material_id"]) for fd in field_data]

        # mvc values get distinguished by a negative sign
        index = [
            int(mid.replace("mp-", "").replace("mvc-", "-")) for mid in material_id
        ]
        df = pd.DataFrame(field_data, index=index)
        df = df.sort_index()

        if not use_theoretical:
            # REVIEW: whether to use MPID class or str of MPIDs?
            # if latter, `expt_df.material_id.apply(str).tolist()`
            expt_df = df.query("theoretical == False")
            expt_material_id = expt_df.material_id.tolist()

            if not one_by_one:
                # Fetch provenance for everything in one request and select
                # the experimental entries locally, see
                # https://github.com/materialsproject/api/issues/613
                provenance_results = mpr.materials.provenance.search(
                    fields=["references", "material_id"]
                )
                provenance_ids = [fpr.material_id for fpr in provenance_results]
                prov_df = pd.Series(
                    name="provenance", data=provenance_results, index=provenance_ids
                )
                expt_provenance_results = prov_df.loc[expt_material_id]
            else:
                # slow version: one request per experimental material
                expt_provenance_results = [
                    mpr.materials.provenance.get_data_by_id(mid)
                    for mid in tqdm(expt_material_id)
                ]

            # The provenance objects themselves are not stored because
            # CrystalSystem is not JSON serializable, see
            # https://github.com/materialsproject/api/issues/615

            # extract earliest ICSD year
            references = [pr.references for pr in expt_provenance_results]
            discovery = get_discovery_dict(references)
            year = [disc["year"] for disc in discovery]
            # `assign` returns a new frame rather than writing into the
            # `query` result, see https://stackoverflow.com/a/35387129/13697228
            expt_df = expt_df.assign(
                references=references, discovery=discovery, year=year
            )

            expt_df = expt_df.sort_values(by=["year"])

    if use_theoretical:
        return df
    elif return_both_if_experimental:
        return expt_df, df
    else:
        return expt_df