# Source code for networkx_temporal.generators.datasets.pubmed

import gzip
import os.path as osp
import urllib.request
from pathlib import Path
from typing import Optional, Union
import zipfile

import networkx as nx

from ...transform import from_static
from ...typing import TemporalDiGraph

# Directory next to this module that holds the bundled PubMed edge and node files.
DATA_PATH = Path(__file__).parent.resolve() / "pubmed"

# Remote location of the zipped node features file, which is fetched on demand.
DATA_URL = "https://zenodo.org/records/17860933/files/pubmed-features.csv.zip"


def pubmed_graph(features: Optional[Union[bool, str]] = False) -> TemporalDiGraph:
    """
    Returns the PubMed temporal graph.

    The PubMed [13]_ temporal [14]_ dataset is a citation network where nodes represent
    scientific papers in the PubMed database, and a directed edge from node :math:`u` to
    node :math:`v` at time :math:`t` indicates that paper :math:`u` cited paper :math:`v`
    at time :math:`t`. The dataset spans 19,717 papers and 44,335 citations over a period
    of 42 years, from 1967 (:math:`t=0`) to 2010 (:math:`t=41`). The first cited paper is
    from 1964.

    Edges have a ``'time'`` attribute indicating the year the citation took place, starting
    from 1967, while nodes have an associated ``'label'`` attribute representing the paper's
    research topic, among three possible classes.

    If ``features`` is ``True``, nodes will have additional attributes corresponding to the
    TF-IDF scores of specific words in each paper's abstract. The features file is looked up
    in the current working directory, or in the location given by ``features`` when it is a
    string. If the file is not present there, it will be downloaded from a remote repository.

    .. rubric:: Example

    To load the dataset already sliced into yearly snapshots:

    .. code-block:: python

        >>> import networkx_temporal as tx
        >>>
        >>> TG = tx.generators.pubmed_graph()
        >>> print(TG)

        TemporalDiGraph (t=42) with 19717 nodes and 44335 edges

    .. [13] Namata et al. (2012). ''Query-driven Active Surveying for Collective
           Classification''. Workshop on Mining and Learning with Graphs (MLG),
           Edinburgh, Scotland, UK, 2012. url: `people.cs.vt.edu/~bhuang/papers/namata-mlg12.pdf
           <https://people.cs.vt.edu/~bhuang/papers/namata-mlg12.pdf>`__.

    .. [14] Passos, N.A.R.A., Carlini, E., Trani, S. (2024). ''Deep Community Detection in
           Attributed Temporal Graphs: Experimental Evaluation of Current Approaches''.
           In Proceedings of the 3rd GNNet Workshop: Graph Neural Networking Workshop.
           The 20th International Conference on emerging Networking EXperiments and
           Technologies (CoNEXT 2024), Los Angeles, CA, USA.
           doi: `10.1145/3694811.3697822 <https://doi.org/10.1145/3694811.3697822>`__.

    :param features: If ``True``, loads additional node features from file. Allows passing
        a string pointing to the directory where the `pubmed-features.csv.zip
        <https://zenodo.org/records/17860933/files/pubmed-features.csv.zip>`__
        file is located, or to the file itself. If the file is not found, it will be
        downloaded automatically. Default is ``False``.

    :raises TypeError: If ``features`` is neither ``None``, a boolean nor a string.
    :raises RuntimeError: If the features file has to be downloaded and the download fails.

    :note: Dataset and files available from
        `Zenodo <https://doi.org/10.5281/zenodo.13932075>`__.
    """
    if features is not None and not isinstance(features, (bool, str)):
        raise TypeError("Argument `features` must be a boolean or string.")

    # Each row holds two node IDs followed by an integer ``time``.
    # The first line of the file is skipped (presumably a header).
    with gzip.open(DATA_PATH / "pubmed-edges.csv.gz", "r") as f:
        G = nx.read_edgelist(
            f.readlines()[1:],
            create_using=nx.DiGraph,
            data=[("time", int)],
            delimiter=",",
            nodetype=str,
        )

    # First column is the node ID, second column its integer class label.
    with gzip.open(DATA_PATH / "pubmed-nodes.csv.gz", "rt") as f:
        nx.set_node_attributes(
            G,
            {
                line.split(",")[0]: int(line.strip().split(",")[1])
                for line in f.readlines()[1:]
            },
            "label",
        )

    if features:
        # ``features=True`` means "look in the current working directory".
        path = Path(features if isinstance(features, str) else ".")

        if osp.isdir(path):
            path = path / "pubmed-features.csv.zip"

        # Download features file if it does not exist (or is not a valid archive).
        if not zipfile.is_zipfile(path):
            _download_pubmed_features(path.resolve())

        # Load additional node features from file.
        # Skip first column (node ID): id,feat_0,feat_1,...
        # Each feature is a float value, zero if empty in file.
        with zipfile.ZipFile(path, "r") as zf:
            with zf.open("pubmed-features.csv", "r") as f:
                lines = f.read().decode("utf-8").splitlines()

        attrs = lines[0].strip().split(",")[1:]  # Skip the first column (node ID).

        node_attr = {
            line.split(",")[0]: {
                key: float(value if value else 0)
                for key, value in zip(attrs, line.strip().split(",")[1:])
            }
            for line in lines[1:]  # Skip header.
        }

        nx.set_node_attributes(G, node_attr)

    # Convert to a temporal graph and split it into one snapshot per ``time`` value.
    TG = from_static(G)
    TG = TG.slice(attr="time")
    TG.name = "PubMed"
    return TG


def _download_pubmed_features(filepath: Union[str, Path]) -> None:
    """
    Downloads the PubMed node features archive from ``DATA_URL`` to ``filepath``.

    The remote file is fetched completely before the destination file is opened,
    so a failed request does not leave an empty file behind.

    :param filepath: Destination path where the downloaded archive is written.

    :raises RuntimeError: If the download or the write to disk fails. The original
        exception is chained as the cause.
    """
    try:
        # Use the response as a context manager so the connection is closed
        # deterministically, including when reading fails midway.
        with urllib.request.urlopen(DATA_URL, timeout=15) as response:
            content = response.read()

        with open(filepath, "wb") as f:
            f.write(content)

    except Exception as e:
        raise RuntimeError(f"{e}: Failed to download PubMed features file from {DATA_URL}") from e