Module langbrainscore.dataset.dataset

Expand source code
# stdlib imports
import typing
from collections import abc
from pathlib import Path

import numpy as np
import xarray as xr
from joblib import Parallel, delayed
from langbrainscore.interface import _Dataset
from langbrainscore.utils.logging import log
from langbrainscore.utils.xarray import collapse_multidim_coord
from tqdm import tqdm


class Dataset(_Dataset):
    @property
    def contents(self) -> xr.DataArray:
        """
        access the internal xarray object. use with caution.
        """
        return self._xr_obj

    @property
    def stimuli(self) -> xr.DataArray:
        """
        getter method that returns an xarray object of stimuli and associated metadata

        Returns:
            xr.DataArray: xarray object containing the stimuli from the dataset and associated metadata
        """
        return self.contents.stimulus

    @property
    def dims(self) -> tuple:
        """
        getter method that returns internal xarray dimensions

        Returns:
            tuple[str]: dimensions of internal xarray object
        """
        return self.contents.dims

    def to_netcdf(self, filename):
        """
        outputs the xarray.DataArray object to a netCDF file identified by
        `filename`. if it already exists, overwrites it.
        """
        if Path(filename).expanduser().resolve().exists():
            log(f"{filename} already exists. overwriting.", type="WARN")
        self._xr_obj.to_netcdf(filename)

    @classmethod
    def load_netcdf(cls, filename):
        """
        loads a netCDF object that contains a pre-packaged xarray instance from
        a file at `filename`.
        """
        return cls(xr.load_dataarray(filename))

    @classmethod
    def from_file_or_url(
        cls,
        file_path_or_url: typing.Union[str, Path],
        data_column: str,
        sampleid_index: str,
        neuroid_index: str,
        stimuli_index: str,
        timeid_index: str = None,
        subject_index: str = None,
        sampleid_metadata: typing.Union[
            typing.Iterable[str], typing.Mapping[str, str]
        ] = None,
        neuroid_metadata: typing.Union[
            typing.Iterable[str], typing.Mapping[str, str]
        ] = None,
        timeid_metadata: typing.Union[
            typing.Iterable[str], typing.Mapping[str, str]
        ] = None,
        multidim_metadata: typing.Iterable[
            typing.Mapping[str, typing.Iterable[str]]
        ] = None,
        sort_by: typing.Iterable[str] = (),
        sep=",",
        parallel: int = -2,
    ) -> _Dataset:
        """Creates a Dataset object holding an `xr.DataArray` instance using a CSV file readable by pandas.
            Constructs the `xr.DataArray` using specified columns to construct dimensions and
            metadata along those dimensions in the form of coordinates.
            Minimally requires `sampleid` and `neuroid` to be provided.

            Note: Each row of the supplied file must have a single data point corresponding
            to a unique `sampleid`, `neuroid`, and `timeid` (unique dimension values).
            I.e., each neuroid (which could be a voxel, an ROI, a reaction time RT value, etc.)
            must be on a new line for the same stimulus trial at a certain time.
            If `timeid_index` and `subject_index` are not provided:
                - a singleton timeid dimension is created with the value 0 for each sample.
                - a singleton subject dimension is created with the value "subject0" that spans the entire data.
            For help on what these terms mean, please visit the
            [xarray glossary page](https://xarray.pydata.org/en/stable/user-guide/terminology.html)


        Args:
            file_path_or_url (typing.Union[str, Path]): a path or URL to a CSV file (or a gzipped
                parquet file with a name ending in ".parquet.gzip")
            data_column (str): title of the column that holds the datapoints per unit of measurement
                (e.g., BOLD contrast effect size, reaction time, voltage amplitude, etc)
            sampleid_index (str): title of the column that should be used to construct an index for sampleids.
                this should be unique for each stimulus in the dataset.
            neuroid_index (str): title of the column that should be used to construct an index for neuroids.
                this should be unique for each point of measurement within a subject. e.g., voxel1, voxel2, ...
                neuroids in the packaged dataset are transformed to be a product of subject_index and neuroid_index.
            stimuli_index (str): title of the column that holds stimuli shown to participants
            timeid_index (str, optional): title of the column that holds timepoints of stimulus presentation.
                if not provided, a singleton timepoint '0' is assigned to each datapoint. Defaults to None.
            subject_index (str, optional): title of the column specifying subject IDs. Defaults to None.
            sampleid_metadata (typing.Union[typing.Iterable[str], typing.Mapping[str,str]], optional):
                names of columns (and optionally mapping of existing column names to new coordinate names)
                that supply metadata along the sampleid dimension. Defaults to None.
            neuroid_metadata (typing.Union[typing.Iterable[str], typing.Mapping[str,str]], optional):
                see `sampleid_metadata`. Defaults to None.
            timeid_metadata (typing.Union[typing.Iterable[str], typing.Mapping[str,str]], optional):
                see `sampleid_metadata`. Defaults to None.
            multidim_metadata (typing.Iterable[typing.Mapping[str, typing.Iterable[str]]], optional):
                metadata to go with more than one dimension. e.g., chunks of a stimulus that unfolds with time.
                currently `NotImplemented`. Defaults to None.
            sort_by (typing.Iterable[str], optional): Sort data by these columns while repackaging it.
                data is additionally sorted by `subject_index`, `sampleid_index`, `neuroid_index`, and
                `timeid_index` after these columns. Defaults to ().
            sep (str, optional): separator to read a value-delimited file. this argument is passed to pandas.
                Defaults to ','.
            parallel (int, optional): number of parallel jobs used to reassemble the data; passed to
                joblib's `Parallel` as `n_jobs`. a falsy value (0 or None) disables parallelism.
                Defaults to -2 (all but one CPU).

        Raises:
            ValueError: if the file cannot be read as parquet or CSV, or if a metadata value expected
                to be constant within a (sampleid, neuroid, timeid) slice differs across rows.

        Returns:
            _Dataset: a subclass of the `_Dataset` interface with the packaged xarray.DataArray as a member.
        """

        T = typing.TypeVar("T")

        def collapse_same_value(arr: typing.Iterable[T]) -> T:
            """
            makes sure each element in an iterable is identical (using __eq__)
            to every other element by value and returns (any) one of the elements.
            if a non-identical element (!=) is found, raises ValueError
            """
            try:
                first_thing = next(iter(arr))
            except StopIteration:
                log(f"failed to obtain value from {arr}", verbosity_check=True)
                return np.nan
            for each_thing in arr:
                if first_thing != each_thing:
                    raise ValueError(f"{first_thing} != {each_thing}")
            return first_thing

        import pandas as pd

        if str(file_path_or_url).endswith(".parquet.gzip"):
            try:
                df = pd.read_parquet(file_path_or_url)
            except Exception as invalid_file:
                raise ValueError("invalid parquet file / filename") from invalid_file
        else:
            try:
                df = pd.read_csv(file_path_or_url, sep=sep)
            except Exception as invalid_file:
                raise ValueError("invalid csv file / filename") from invalid_file

        if timeid_index is None:
            timeid_index = "timeid"
            # create singleton timeid
            # we don't need to inflate data since each datapoint will just
            # correspond to timeid == 0 per sample
            timeid_column = [0] * len(df)
            df[timeid_index] = timeid_column
        if subject_index is None:
            subject_index = "subject"
            # create singleton subjectID
            # we don't need to inflate data since each datapoint will just
            # correspond to subject == 0 per sample
            subject_column = ["subject0"] * len(df)
            df[subject_index] = subject_column
        if not parallel:
            parallel = 1

        subjects = list(set(df[subject_index]))
        sampleids = list(set(df[sampleid_index]))
        # NOTE: if the same stimulus is shown multiple times, this yields entries
        # with the same sampleid that must then be differentiated on the basis of
        # metadata alone. https://i.imgur.com/4V2DsIo.png
        neuroids = list(set(df[neuroid_index]))
        timeids = list(set(df[timeid_index]))

        # `or ()` guards against the None default, mirroring the timeid_metadata case below
        if not isinstance(sampleid_metadata, abc.Mapping):
            sampleid_metadata = {k: k for k in sampleid_metadata or ()}
        if not isinstance(neuroid_metadata, abc.Mapping):
            neuroid_metadata = {k: k for k in neuroid_metadata or ()}
        if not isinstance(timeid_metadata, abc.Mapping):
            timeid_metadata = {k: k for k in timeid_metadata or ()}

        df = df.sort_values(
            [*sort_by, subject_index, sampleid_index, neuroid_index, timeid_index]
        )

        def get_sampleid_xr(sampleid):
            sampleid_view = df[df[sampleid_index] == sampleid]

            neuroid_xrs = []
            for neuroid in neuroids:
                neuroid_view = sampleid_view[sampleid_view[neuroid_index] == neuroid]

                timeid_xrs = []
                for timeid in timeids:
                    timeid_view = neuroid_view[neuroid_view[timeid_index] == timeid]
                    data = timeid_view[data_column].values
                    timeid_xr = xr.DataArray(
                        data.reshape(1, len(timeid_view[subject_index]), 1),
                        dims=("sampleid", "neuroid", "timeid"),
                        coords={
                            "sampleid": np.repeat(sampleid, 1),
                            "neuroid": [
                                f"{a}_{b}"
                                for a, b in zip(
                                    timeid_view[subject_index],
                                    timeid_view[neuroid_index],
                                )
                            ],
                            "timeid": np.repeat(timeid, 1),
                            "subject": ("neuroid", timeid_view[subject_index]),
                            "stimulus": (
                                "sampleid",
                                [collapse_same_value(timeid_view[stimuli_index])],
                            ),
                            **{
                                metadata_names[column]: (
                                    dimension,
                                    [collapse_same_value(timeid_view[column])],
                                )
                                for dimension, metadata_names in (
                                    ("sampleid", sampleid_metadata),
                                    ("timeid", timeid_metadata),
                                )
                                for column in metadata_names
                            },
                            **{
                                neuroid_metadata[column]: (
                                    "neuroid",
                                    (timeid_view[column]),
                                )
                                for column in neuroid_metadata
                            },
                        },
                    )
                    timeid_xrs += [timeid_xr]

                neuroid_xr = xr.concat(timeid_xrs, dim="timeid")
                neuroid_xrs += [neuroid_xr]

            sampleid_xr = xr.concat(neuroid_xrs, dim="neuroid")
            return sampleid_xr

        sampleid_xrs = Parallel(n_jobs=parallel)(
            delayed(get_sampleid_xr)(sampleid)
            for sampleid in tqdm(sampleids, desc="reassembling data per sampleid")
        )

        unified_xr = xr.concat(sampleid_xrs, dim="sampleid")

        for dimension, metadata_names in (
            ("sampleid", {**sampleid_metadata, "stimulus": "stimulus"}),
            ("timeid", timeid_metadata),
            ("neuroid", {**neuroid_metadata, "subject": "subject"}),
        ):
            for column in metadata_names:
                try:
                    unified_xr = collapse_multidim_coord(
                        unified_xr, metadata_names[column], dimension
                    )
                except ValueError as e:
                    log(
                        f"failed to collapse coord. dimension:{dimension}, column:{column}, "
                        f"shape:{unified_xr[metadata_names[column]].shape} ({e})",
                        type="ERR",
                    )

        return cls(unified_xr)  # NOTE: we use `cls` rather than `Dataset` so any
        # subclasses will use the subclass rather than parent

Classes

class Dataset (xr_obj: xarray.core.dataarray.DataArray, dataset_name: str = None)

wrapper class for an xarray DataArray that confirms its format adheres to the interface.

accepts an xarray with the core dimensions sampleid, neuroid, and timeid, and at least the core coordinates sampleid, neuroid, timeid, stimulus, and subject.

Args

xr_obj : xr.DataArray
xarray object with core dimensions and coordinates
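
As an illustration, a minimal construction sketch. The shapes, names, and import path here are hypothetical; depending on packaging, Dataset may instead need to be imported from langbrainscore.dataset.dataset.

import numpy as np
import xarray as xr
from langbrainscore.dataset import Dataset  # hypothetical import path

# hypothetical data: 4 stimuli x 6 neuroids (2 subjects x 3 voxels each) x 1 timepoint
subjects = [f"subject{s}" for s in (0, 1) for _ in range(3)]
neuroids = [f"subject{s}_voxel{v}" for s in (0, 1) for v in range(3)]
xr_obj = xr.DataArray(
    np.random.rand(4, 6, 1),
    dims=("sampleid", "neuroid", "timeid"),
    coords={
        "sampleid": np.arange(4),
        "neuroid": neuroids,
        "timeid": [0],
        # per-sample and per-neuroid coordinates required by the interface
        "stimulus": ("sampleid", ["The dog barked.", "A cat slept.", "It rained.", "Birds sang."]),
        "subject": ("neuroid", subjects),
    },
)
ds = Dataset(xr_obj)
ds.dims  # ('sampleid', 'neuroid', 'timeid')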

Ancestors

  • langbrainscore.interface.dataset._Dataset
  • langbrainscore.interface.cacheable._Cacheable
  • typing.Protocol
  • typing.Generic
  • abc.ABC

Class variables

var dataset_name : str

Static methods

def from_file_or_url(file_path_or_url: Union[str, pathlib.Path], data_column: str, sampleid_index: str, neuroid_index: str, stimuli_index: str, timeid_index: str = None, subject_index: str = None, sampleid_metadata: Union[Iterable[str], Mapping[str, str]] = None, neuroid_metadata: Union[Iterable[str], Mapping[str, str]] = None, timeid_metadata: Union[Iterable[str], Mapping[str, str]] = None, multidim_metadata: Iterable[Mapping[str, Iterable[str]]] = None, sort_by: Iterable[str] = (), sep=',', parallel: int = -2) ‑> langbrainscore.interface.dataset._Dataset

Creates a Dataset object holding an xr.DataArray instance using a CSV (or parquet) file readable by pandas. Constructs the xr.DataArray using specified columns to construct dimensions and metadata along those dimensions in the form of coordinates. Minimally requires data_column, sampleid_index, neuroid_index, and stimuli_index to be provided.

Note: Each row of the supplied file must have a single data point corresponding
to a unique sampleid, neuroid, and timeid (unique dimension values).
I.e., each neuroid (which could be a voxel, an ROI, a reaction time RT value, etc.)
must be on a new line for the same stimulus trial at a certain time.
If timeid_index and subject_index are not provided:
    - a singleton timeid dimension is created with the value 0 for each sample.
    - a singleton subject dimension is created with the value "subject0" that spans the entire data.
For help on what these terms mean, please visit the
[xarray glossary page](https://xarray.pydata.org/en/stable/user-guide/terminology.html)
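
For concreteness, a hypothetical long-format file satisfying the one-datapoint-per-row requirement might look like this (all column names and values are placeholders):

stim_id,sentence,subj_id,voxel_id,timepoint,effect_size,roi
0,The dog barked.,subject0,voxel1,0,0.42,LIFG
0,The dog barked.,subject0,voxel2,0,0.17,LIFG
0,The dog barked.,subject1,voxel1,0,0.35,LIFG
1,A cat slept.,subject0,voxel1,0,0.11,LIFG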

Args

file_path_or_url : typing.Union[str, Path]
a path or URL to a CSV file (or a gzipped parquet file with a name ending in ".parquet.gzip")
data_column : str
title of the column that holds the datapoints per unit of measurement (e.g., BOLD contrast effect size, reaction time, voltage amplitude, etc)
sampleid_index : str
title of the column that should be used to construct an index for sampleids. this should be unique for each stimulus in the dataset.
neuroid_index : str
title of the column that should be used to construct an index for neuroids. this should be unique for each point of measurement within a subject. e.g., voxel1, voxel2, … neuroids in the packaged dataset are transformed to be a product of subject_index and neuroid_index.
stimuli_index : str
title of the column that holds stimuli shown to participants
timeid_index : str, optional
title of the column that holds timepoints of stimulus presentation. if not provided, a singleton timepoint '0' is assigned to each datapoint. Defaults to None.
subject_index : str, optional
title of the column specifying subject IDs. Defaults to None.
sampleid_metadata : typing.Union[typing.Iterable[str], typing.Mapping[str, str]], optional
names of columns (and optionally a mapping of existing column names to new coordinate names) that supply metadata along the sampleid dimension. Defaults to None.
neuroid_metadata : typing.Union[typing.Iterable[str], typing.Mapping[str, str]], optional
see sampleid_metadata. Defaults to None.
timeid_metadata : typing.Union[typing.Iterable[str], typing.Mapping[str, str]], optional
see sampleid_metadata. Defaults to None.
multidim_metadata : typing.Iterable[typing.Mapping[str, typing.Iterable[str]]], optional
metadata to go with more than one dimension. e.g., chunks of a stimulus that unfolds with time. currently NotImplemented. Defaults to None.
sort_by : typing.Iterable[str], optional
Sort data by these columns while repackaging it. data is additionally sorted by subject_index, sampleid_index, neuroid_index, and timeid_index after these columns. Defaults to ().
sep : str, optional
separator to read a value-delimited file. this argument is passed to pandas. Defaults to ','.
parallel : int, optional
number of parallel jobs used to reassemble the data; passed to joblib's Parallel as n_jobs. a falsy value (0 or None) disables parallelism. Defaults to -2 (all but one CPU).

Raises

ValueError
if the file cannot be read as parquet or CSV, or if a metadata value expected to be constant within a (sampleid, neuroid, timeid) slice differs across rows.

Returns

_Dataset
a subclass of the _Dataset interface with the packaged xarray.DataArray as a member.
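
A usage sketch with the hypothetical columns from the layout above; the file path, column names, and import path are all placeholders, not part of the library's documented API.

from langbrainscore.dataset import Dataset  # hypothetical import path

ds = Dataset.from_file_or_url(
    "data/brain_responses.csv",        # hypothetical path
    data_column="effect_size",
    sampleid_index="stim_id",
    neuroid_index="voxel_id",
    stimuli_index="sentence",
    timeid_index="timepoint",
    subject_index="subj_id",
    neuroid_metadata={"roi": "roi"},   # keep column "roi" as a neuroid coordinate
    parallel=1,                        # run serially; the default -2 uses all but one CPU
)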
def load_netcdf(filename)

loads a netCDF object that contains a pre-packaged xarray instance from a file at filename.


Instance variables

var contents : xarray.core.dataarray.DataArray

access the internal xarray object. use with caution.

var dims : tuple

getter method that returns internal xarray dimensions

Returns

tuple[str]
dimensions of internal xarray object
var stimuli : xarray.core.dataarray.DataArray

getter method that returns an xarray object of stimuli and associated metadata

Returns

xr.DataArray
xarray object containing the stimuli from the dataset and associated metadata
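
A quick sketch of the three accessors on a packaged dataset (ds is hypothetical, e.g. built as in the constructor example above):

ds.contents        # the wrapped xr.DataArray, with all dims and coords
ds.dims            # e.g. ('sampleid', 'neuroid', 'timeid')
ds.stimuli.values  # the stimulus strings along the sampleid dimension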

Methods

def to_netcdf(self, filename)

outputs the xarray.DataArray object to a netCDF file identified by filename. if it already exists, overwrites it.
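
A round-trip sketch (hypothetical filename; writing netCDF assumes an xarray netCDF backend such as netCDF4 is installed):

ds.to_netcdf("my_dataset.nc")              # logs a warning and overwrites if the file exists
ds_again = Dataset.load_netcdf("my_dataset.nc")
assert ds_again.dims == ds.dims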
