# Source code for networkx_temporal.generators.datasets.pubmed

import gzip
import os.path as osp
import urllib.request
from pathlib import Path
from typing import Optional, Union
import zipfile

import networkx as nx

from ...transform import from_static
from ...typing import TemporalDiGraph

# Directory named "pubmed" that sits next to this module.
DATA_PATH = Path(__file__).parent.resolve().joinpath("pubmed")

# Remote location of the zipped node features file.
DATA_URL = "https://zenodo.org/records/17860933/files/pubmed-features.csv.zip"



# [docs]
def pubmed_graph(features: Optional[Union[bool, str]] = False) -> TemporalDiGraph:
    """ Returns the PubMed temporal graph.

    The PubMed [13]_ temporal [14]_ dataset is a citation network where nodes represent scientific
    papers in the PubMed database, and a directed edge from node :math:`u` to node :math:`v` at
    time :math:`t` indicates that paper :math:`u` cited paper :math:`v` at time :math:`t`.
    The dataset spans 19,717 papers and 44,335 citations over a period of 42 years, from 1967
    (:math:`t=0`) to 2010 (:math:`t=41`). The first cited paper is from 1964.

    Edges have a ``'time'`` attribute indicating the year the citation took place, starting from
    1967, while nodes have an associated ``'label'`` attribute representing the paper's research
    topic, among three possible classes.

    If ``features`` is ``True``, nodes will have additional attributes corresponding to the
    TF-IDF scores of specific words in each paper's abstract. The features file is looked up in
    the current working directory, or in the location given by ``features`` if it is a string.
    If the file is not present there, it will be downloaded from a remote repository.

    .. rubric:: Example

    To load the dataset already sliced into yearly snapshots:

    .. code-block:: python

        >>> import networkx_temporal as tx
        >>>
        >>> TG = tx.generators.pubmed_graph()
        >>> print(TG)

        TemporalDiGraph (t=42) with 19717 nodes and 44335 edges

    .. [13] Namata et al. (2012).
        ''Query-driven Active Surveying for Collective Classification''.
        Workshop on Mining and Learning with Graphs (MLG), Edinburgh, Scotland, UK, 2012.
        url: `people.cs.vt.edu/~bhuang/papers/namata-mlg12.pdf
        <https://people.cs.vt.edu/~bhuang/papers/namata-mlg12.pdf>`__.

    .. [14] Passos, N.A.R.A., Carlini, E., Trani, S. (2024).
        ''Deep Community Detection in Attributed Temporal Graphs: Experimental Evaluation of
        Current Approaches''. In Proceedings of the 3rd GNNet Workshop: Graph Neural Networking
        Workshop. The 20th International Conference on emerging Networking EXperiments and
        Technologies (CoNEXT 2024), Los Angeles, CA, USA.
        doi: `10.1145/3694811.3697822 <https://doi.org/10.1145/3694811.3697822>`__.

    :param features: If ``True``, loads additional node features from file.
        Allows passing a string pointing to the directory where the `pubmed-features.csv.zip
        <https://zenodo.org/records/17860933/files/pubmed-features.csv.zip>`__
        file is located, or to the file itself. If the file is not found, it will be downloaded
        automatically. Default is ``False``.

    :raises TypeError: If ``features`` is neither ``None``, a boolean, nor a string.
    :raises RuntimeError: If the features file has to be downloaded and the download fails.

    :note: Dataset and files available from `Zenodo <https://doi.org/10.5281/zenodo.13932075>`__.
    """
    if features is not None and not isinstance(features, (bool, str)):
        raise TypeError("Argument `features` must be a boolean or string.")

    # Edge list: source,target,time. The handle is opened in binary mode, as
    # `read_edgelist` documents for file objects, and the header row is skipped.
    with gzip.open(DATA_PATH / "pubmed-edges.csv.gz", "rb") as f:
        next(f, None)
        G = nx.read_edgelist(f,
                             create_using=nx.DiGraph,
                             data=[("time", int)],
                             delimiter=",",
                             nodetype=str)

    # Node labels: first column is the node ID, second is its integer class.
    with gzip.open(DATA_PATH / "pubmed-nodes.csv.gz", "rt", encoding="utf-8") as f:
        next(f, None)  # Skip header.
        labels = {line.split(",")[0]: int(line.strip().split(",")[1]) for line in f}
    nx.set_node_attributes(G, labels, "label")

    if features:
        # `True` means "look in the current working directory".
        path = Path(features if isinstance(features, str) else ".")
        if osp.isdir(path):
            path = path / "pubmed-features.csv.zip"

        # Download features file if it does not exist (or is not a valid archive).
        if not zipfile.is_zipfile(path):
            _download_pubmed_features(path.resolve())

        # Load additional node features from file.
        # Skip first column (node ID): id,feat_0,feat_1,...
        # Each feature is a float value, zero if empty in file.
        with zipfile.ZipFile(path, "r") as zf:
            with zf.open("pubmed-features.csv", "r") as f:
                lines = f.read().decode("utf-8").splitlines()

        attrs = lines[0].strip().split(",")[1:]  # Skip the first column (node ID).
        node_attr = {
            line.split(",")[0]: {
                key: float(value) if value else 0.0
                for key, value in zip(attrs, line.strip().split(",")[1:])
            }
            for line in lines[1:]  # Skip header.
        }
        nx.set_node_attributes(G, node_attr)

    # Slice the static graph into snapshots keyed by the edge-level 'time' attribute.
    TG = from_static(G)
    TG = TG.slice(attr="time")
    TG.name = "PubMed"
    return TG



def _download_pubmed_features(filepath: Union[str, Path]) -> None:
    """ Downloads the PubMed node features archive from ``DATA_URL`` to ``filepath``.

    The payload is fetched in full before the destination file is opened, so a failed
    download neither creates an empty file nor truncates an existing one.

    :param filepath: Destination path of the downloaded file.

    :raises RuntimeError: If the download or the write to ``filepath`` fails.
    """
    try:
        # Close the HTTP response deterministically once the body has been read.
        with urllib.request.urlopen(DATA_URL, timeout=15) as response:
            payload = response.read()
        with open(filepath, "wb") as f:
            f.write(payload)
    except Exception as e:
        raise RuntimeError(f"{e}: Failed to download PubMed features file from {DATA_URL}") from e