Source code for matbench_genmetrics.mp_time_split.utils.data

import re
from typing import List

import pybtex.errors
from pybtex.database.input import bibtex
from tqdm import tqdm

# Disable pybtex strict mode at import time (module-level side effect).
# NOTE(review): presumably this makes pybtex report malformed BibTeX as warnings
# instead of raising, so one bad reference does not abort a whole run — confirm
# against the pybtex documentation.
pybtex.errors.set_strict_mode(False)

# Snapshot file names: the regular one and its "dummy" counterpart.
SNAPSHOT_NAME = "mp_time_summary.json"
DUMMY_SNAPSHOT_NAME = "mp_dummy_time_summary.json"

# Element symbols grouped as noble gases.
noble = ["He", "Ar", "Ne", "Kr", "Xe", "Og", "Rn"]

# Element symbols grouped as radioactive (original ordering preserved).
# fmt: off
radioactive = [
    "U", "Th", "Ra", "Pu", "Po", "Rn", "Cm", "At", "Bk", "Fr",
    "Ac", "Am", "Bh", "Cf", "Np", "Ts", "Tc", "Md", "Lr", "Fm",
    "Hs", "Mt", "No", "Pm", "Rf", "Sg", "Ds", "Cn", "Rg", "Lv",
    "Og", "Fl", "Nh", "Db", "Es", "Mc", "Pa", "Bi", "Cs",
]
# fmt: on



[docs]
def get_discovery_dict(references: List[List[str]]) -> List[dict]:
    """Get the earliest bibliographic info for each set of BibTeX references.

    Modified from source:
    "How do I do a time-split of Materials Project entries? e.g. pre-2018 vs. post-2018"
    https://matsci.org/t/42584/4?u=sgbaird, answer by @Joseph_Montoya, Materials Project Alumni

    Parameters
    ----------
    references : List[List[str]]
        One item per material. Each item is an iterable of BibTeX strings, which are
        concatenated and parsed together. The strings are presumably the references
        returned by the Materials Project provenance API; confirm against the caller.

    Returns
    -------
    discovery : List[dict]
        One dictionary per item of ``references``, in the same order, with keys
        ``["year", "authors", "num_authors"]`` describing the entry with the earliest
        year. ``authors`` is a list of author names as strings; it is empty (and
        ``num_authors`` is 0) when the earliest entry lists no authors. All three
        values are ``None`` when no entry has a ``year`` field starting with four
        digits.

    Examples
    --------
    Illustrative only; assumes each provenance result exposes its BibTeX strings as
    ``.references``.

    >>> with MPRester(api_key) as mpr:
    ...     provenance_results = mpr.materials.provenance.search(num_sites=(1, 4), elements=["V"])
    >>> references = [result.references for result in provenance_results]
    >>> discovery = get_discovery_dict(references)
    >>> discovery[:2]
    [{'year': 1963, 'authors': ['Raub, E.', 'Fritzsche, W.'], 'num_authors': 2}, {'year': 1925, 'authors': ['Becker, K.', 'Ebert, F.'], 'num_authors': 2}]
    """
    # Compiled once, outside the loop; anchored at the start of the field by ``match``.
    year_pattern = re.compile(r"\d{4}")

    discovery = []
    for bibtex_strings in tqdm(references):
        # A fresh parser per item, as in the original code, so that entries parsed for
        # one item are never mixed with those of another.
        parser = bibtex.Parser()
        bib_data = parser.parse_string("".join(bibtex_strings))

        dated_entries = []
        for entry in bib_data.entries.values():
            if "year" not in entry.fields:
                continue
            year_match = year_pattern.match(entry.fields["year"])
            if year_match:
                # Convert only the matched digits: converting the whole field would
                # raise ValueError on values such as "1963a" or "1963-1964".
                dated_entries.append((int(year_match.group()), entry))

        if not dated_entries:
            discovery.append(dict(year=None, authors=None, num_authors=None))
            continue

        # ``min`` returns the first entry with the smallest year, which is the same
        # entry a stable sort would place first.
        year, earliest = min(dated_entries, key=lambda pair: pair[0])

        # The earliest entry may have no "author" role (e.g. only editors); treat that
        # as an empty author list rather than failing with KeyError.
        persons = earliest.persons
        author_persons = persons["author"] if "author" in persons else []
        authors = [str(person) for person in author_persons]

        discovery.append(
            {"year": year, "authors": authors, "num_authors": len(authors)}
        )
    return discovery



# def encode_dataframe(df):
#     jsonpickle_pandas.register_handlers()
#     return jsonpickle.encode(df)


# def decode_dataframe_from_string(string):
#     jsonpickle_pandas.register_handlers()
#     return jsonpickle.decode(string, classes=[Structure])


# %% Code graveyard