# Source code for matbench_genmetrics.mp_time_split.utils.data

import re
from typing import List

import pybtex.errors
from pybtex.database.input import bibtex
from tqdm import tqdm

# Disable pybtex strict mode at import time, before any BibTeX is parsed below.
# NOTE(review): presumably this keeps malformed BibTeX entries from aborting a
# parse (errors reported instead of raised) — confirm against the pybtex docs.
pybtex.errors.set_strict_mode(False)

# Snapshot names (presumably JSON file names for the full and the dummy
# time-summary snapshots — verify against the code that reads/writes them).
SNAPSHOT_NAME = "mp_time_summary.json"
DUMMY_SNAPSHOT_NAME = "mp_dummy_time_summary.json"

# Element symbols grouped into the two categories named by the variables.
noble = ["He", "Ar", "Ne", "Kr", "Xe", "Og", "Rn"]
# fmt: off
radioactive = [
    "U", "Th", "Ra", "Pu", "Po", "Rn", "Cm", "At", "Bk", "Fr",
    "Ac", "Am", "Bh", "Cf", "Np", "Ts", "Tc", "Md", "Lr", "Fm",
    "Hs", "Mt", "No", "Pm", "Rf", "Sg", "Ds", "Cn", "Rg", "Lv",
    "Og", "Fl", "Nh", "Db", "Es", "Mc", "Pa", "Bi", "Cs",
]
# fmt: on


# Matches the four leading digits of a BibTeX ``year`` field.
_YEAR_PREFIX = re.compile(r"\d{4}")


def _parse_year(raw):
    """Return the year encoded in a BibTeX ``year`` field, or ``None``.

    A field is accepted when it starts with four digits. If the whole field is
    a valid integer literal it is used as-is; otherwise (e.g. ``"2001-2002"``
    or ``"1999a"``) only the four leading digits are used, instead of letting
    ``int()`` raise ``ValueError`` on the full string.
    """
    match = _YEAR_PREFIX.match(raw)
    if match is None:
        return None
    try:
        return int(raw)
    except ValueError:
        return int(match.group())


def get_discovery_dict(references: List[List[str]]) -> List[dict]:
    """Get the earliest bibliographic info for each MP entry.

    Modified from source:
    "How do I do a time-split of Materials Project entries? e.g. pre-2018 vs.
    post-2018" https://matsci.org/t/42584/4?u=sgbaird, answer by
    @Joseph_Montoya, Materials Project Alumni

    Parameters
    ----------
    references : List[List[str]]
        One item per material. Each item is an iterable of BibTeX strings,
        which are concatenated and parsed together, e.g. the references taken
        from the ``ProvenanceRester`` API results
        (:func:`mp_api.materials.provenance`).

    Returns
    -------
    discovery : List[dict]
        One dictionary per item of ``references``, in the same order, with keys
        ``["year", "authors", "num_authors"]`` describing the entry with the
        earliest year. If no entry has a usable ``year`` field, all three
        values are ``None``. If the earliest entry lists no authors,
        ``authors`` is ``[]`` and ``num_authors`` is ``0``.

    Examples
    --------
    >>> with MPRester(api_key) as mpr:
    ...     provenance_results = mpr.materials.provenance.search(
    ...         num_sites=(1, 4), elements=["V"]
    ...     )
    >>> references = [pr.references for pr in provenance_results]
    >>> discovery = get_discovery_dict(references)
    [{'year': 1963, 'authors': ['Raub, E.', 'Fritzsche, W.'], 'num_authors': 2},
     {'year': 1925, 'authors': ['Becker, K.', 'Ebert, F.'], 'num_authors': 2},
     {'year': 1965, 'authors': ['Giessen, B.C.', 'Grant, N.J.'], 'num_authors': 2},
     {'year': 1957, 'authors': ['Philip, T.V.', 'Beck, P.A.'], 'num_authors': 2},
     {'year': 1963, 'authors': ['Darby, J.B.jr.'], 'num_authors': 1},
     {'year': 1977, 'authors': ['Aksenova, T.V.', 'Kuprina, V.V.',
                                'Bernard, V.B.', 'Skolozdra, R.V.'],
      'num_authors': 4},
     {'year': 1964, 'authors': ['Maldonado, A.', 'Schubert, K.'], 'num_authors': 2},
     {'year': 1962, 'authors': ['Darby, J.B.jr.', 'Lam, D.J.', 'Norton, L.J.',
                                'Downey, J.W.'],
      'num_authors': 4},
     {'year': 1925, 'authors': ['Becker, K.', 'Ebert, F.'], 'num_authors': 2},
     {'year': 1959, 'authors': ['Dwight, A.E.'], 'num_authors': 1}]
    """
    discovery = []
    for bib_strings in tqdm(references):
        # A new parser is built for every material.
        # NOTE(review): presumably so entries parsed for one material do not
        # carry over into the next — confirm against pybtex's Parser behavior.
        parser = bibtex.Parser()
        entries = parser.parse_string("".join(bib_strings)).entries

        # Keep only the entries whose ``year`` field yields a usable year.
        dated = []
        for entry in entries.values():
            if "year" not in entry.fields:
                continue
            year = _parse_year(entry.fields["year"])
            if year is not None:
                dated.append((year, entry))

        if not dated:
            discovery.append(dict(year=None, authors=None, num_authors=None))
            continue

        # ``min`` returns the first entry with the smallest year, i.e. the same
        # entry a stable sort by year would place first.
        first_year, first_entry = min(dated, key=lambda pair: pair[0])
        # Entries without an "author" role (e.g. editor-only) give no authors
        # rather than raising ``KeyError``.
        authors = [str(auth) for auth in first_entry.persons.get("author", [])]
        discovery.append(
            {
                "year": first_year,
                "authors": authors,
                "num_authors": len(authors),
            }
        )
    return discovery
# def encode_dataframe(df):
#     jsonpickle_pandas.register_handlers()
#     return jsonpickle.encode(df)


# def decode_dataframe_from_string(string):
#     jsonpickle_pandas.register_handlers()
#     return jsonpickle.decode(string, classes=[Structure])

# %% Code graveyard