# Source code for visiannot.tools.ToolsData

# -*- coding: utf-8 -*-
#
# Copyright Université Rennes 1 / INSERM
# Contributor: Raphael Weber
#
# Under CeCILL license
# http://www.cecill.info

"""
Module with functions for loading and saving data files
"""


import numpy as np
import sys
from os.path import isfile, split, abspath, dirname, realpath
from scipy.io import loadmat
from h5py import File, Dataset
from .ToolsAudio import getDataAudio


def getWorkingDirectory(path):
    """
    Gets the working directory when ViSiAnnoT is launched, which depends on
    whether it is launched as a Python script or as an executable (generated
    with PyInstaller)

    Typically, ``path`` is the path to a Python module of **visiannot** that is
    being executed.

    In case it is launched as a Python script, it returns the absolute path to
    the directory containing the module (symbolic links are resolved).

    In case it is launched as an executable generated with PyInstaller, it
    returns the path to the temporary directory created by PyInstaller where
    the source code and related data files are put.

    :param path: typically ``__file__``
    :type path: str

    :returns: absolute path to the working directory
    :rtype: str
    """

    # the attribute ``sys._MEIPASS`` only exists when running from a
    # PyInstaller bundle, in which case ``path`` is ignored
    if hasattr(sys, "_MEIPASS"):
        return abspath(getattr(sys, "_MEIPASS"))

    # regular Python script => directory containing the module
    return dirname(realpath(path))


def convertIntervalsToTimeSeries(intervals, nframes):
    """
    Converts intervals as 2D array to a time series of 0 and 1 (1D array)

    :param intervals: intervals in frame numbers, shape
        :math:`(n_{intervals}, 2)`, where the first column is the start frame
        (included) and the second column is the end frame (excluded)
    :type intervals: numpy array or list
    :param nframes: number of frames of the time series
    :type nframes: int

    :returns: intervals as a time series, shape :math:`(n_{frames},)`
    :rtype: numpy array

    If the end time of an interval is -1 (second column of ``intervals``), then
    the end time is set to ``nframes``.

    The input ``intervals`` is never modified.

    Example::
        >>> a = np.array([[4, 5], [9, 12], [16, -1]])
        >>> convertIntervalsToTimeSeries(a, 20)
        array([0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
               1., 1., 1., 1.])
    """

    # numpy arrays are cast to integer (this makes a copy)
    if isinstance(intervals, np.ndarray):
        intervals = intervals.astype(int)

    time_series = np.zeros((nframes,))

    for interval in intervals:
        # work on local values instead of writing back in ``interval``, so
        # that a list given by the caller is not altered and tuples are
        # supported
        start, stop = int(interval[0]), int(interval[1])

        # -1 as end frame means "until the end of the time series"
        if stop == -1:
            stop = nframes

        time_series[start:stop] = 1

    return time_series


def convertTimeSeriesToIntervals(data, value):
    """
    Gets the intervals of a 1D signal with a specific value

    :param data: 1D signal
    :type data: numpy array or list
    :param value: value that defines the intervals to retrieve from ``data``,
        it may be NaN in order to retrieve the intervals of NaN

    :return: 2D array of shape :math:`(n_{intervals}, 2)` with indexes of
        intervals (ending index is not included in the interval, as with
        ``range`` in Python), it has 0 row if ``value`` is not in ``data``
    :rtype: numpy array

    Example::
        >>> a = np.array([0, 0, 0, 0, 5, 1, 1, 1, 1, 5, 5, 5, 0, 0, 0, 0])
        >>> convertTimeSeriesToIntervals(a,0)
        array([[ 0,  4],
               [12, 16]])
        >>> convertTimeSeriesToIntervals(a,1)
        array([[5, 9]])
        >>> convertTimeSeriesToIntervals(a,5)
        array([[ 4,  5],
               [ 9, 12]])
    """

    # make sure that the comparison below is done element-wise (with a list,
    # ``data == value`` would be a single boolean)
    data = np.asarray(data)

    # NaN is never equal to itself, so it needs a dedicated test
    if np.isnan(value):
        mask = np.isnan(data)

    else:
        mask = data == value

    # indexes of the samples equal to value
    inds = np.where(mask)[0]

    if inds.shape[0] == 0:
        return np.empty((0, 2), dtype=int)

    # positions in inds where there is a gap, i.e. where an interval ends and
    # the next one begins
    gaps = np.where(np.diff(inds) > 1)[0]

    # first index of each interval
    start_inds = np.hstack((inds[0:1], inds[gaps + 1]))

    # last index of each interval + 1 (ending index is excluded)
    end_inds = np.hstack((inds[gaps] + 1, inds[-1:] + 1))

    return np.vstack((start_inds, end_inds)).T


def getDataInterval(path, key=""):
    """
    Loads file containing temporal intervals, output shape
    :math:`(n_{intervals},2)`

    The file format must be supported by :func:`.ToolsData.getDataGeneric`.

    The data can be stored in two ways:

    - shape :math:`(n_{intervals},2)`, where each line contains the start frame
      and end frame of an interval, then no conversion is needed
    - shape :math:`(n_{samples},)` with 0 and 1, then it is converted to shape
      :math:`(n_{intervals},2)`

    A 2D array with exactly two columns is always considered as intervals, so
    that a file with a single interval (shape :math:`(1,2)`) is not mistaken
    for a time series with two samples.

    :param path: path to the data file
    :type path: str
    :param key: key to access the data in case of mat or h5 file, for txt file
        it is ignored
    :type key: str

    :returns: numpy array of shape :math:`(n_{intervals},2)` with intervals in
        frames number, it has 0 row if the file does not exist or if there is
        no interval
    :rtype: numpy array
    """

    # file not found => no interval
    if not isfile(path):
        return np.empty((0, 2))

    raw_array = np.asarray(getDataGeneric(path, key=key, ndmin=2))

    if raw_array.ndim == 2 and raw_array.shape[1] == 2:
        # already intervals, squeezing must be avoided because it would
        # collapse a single interval to shape (2,), which would then be
        # converted as if it were a time series
        data_array = raw_array

    else:
        # drop singleton dimensions, so that a time series stored as a row or
        # as a column ends up as a 1D array
        data_array = np.squeeze(raw_array)

    if data_array.ndim == 1:
        # time series of 0 and 1 => intervals where the value is 1
        data_array = convertTimeSeriesToIntervals(data_array, 1)

    elif data_array.shape[0] == 0:
        data_array = np.empty((0, 2))

    return data_array


def getDataIntervalAsTimeSeries(path, n_samples, key=""):
    """
    Loads file containing temporal intervals, output shape
    :math:`(n_{samples},)`

    The data can be stored in two ways:

    - shape :math:`(n_{intervals},2)`, where each line contains the start frame
      and end frame of an interval, then it is converted to shape
      :math:`(n_{samples},)`, so the number of frames must be specified
      (allowed formats: txt, mat, h5)
    - shape :math:`(n_{samples},)` with 0 and 1, then no conversion is needed
      (allowed formats: mat, h5)

    :param path: path to the data file
    :type path: str
    :param n_samples: number of samples of the time series
    :type n_samples: int
    :param key: key to access the data in case of mat or h5 file, for txt file
        it is ignored
    :type key: str

    :returns: numpy array of shape :math:`(n_{samples},)` with intervals as a
        time series of 0 and 1, or full of NaN if the file does not exist (in
        this case a message is printed)
    :rtype: numpy array
    """

    if not isfile(path):
        print("Time series full of NaN because file not found: %s" % path)
        return np.full((n_samples,), np.nan)

    data_array = getDataGeneric(path, key=key, ndmin=2)

    # 2D array => intervals to convert, otherwise the data is returned as
    # loaded
    if data_array.ndim == 2:
        # negative frame numbers are set to 0
        # NOTE(review): this also replaces an end frame of -1 by 0 before
        # calling convertIntervalsToTimeSeries, so its "-1 means until the
        # end" convention cannot apply here -- confirm that this is intended
        data_array[data_array < 0] = 0

        data_array = convertIntervalsToTimeSeries(data_array, n_samples)

    return data_array


def getTxtLines(path):
    """
    Loads a text file as a list of lines

    The file is opened in text mode with the default encoding of the platform.
    The line endings are kept in the returned strings.

    :param path: path to the text file
    :type path: str

    :returns: list of strings with the lines of the file
    :rtype: list
    """

    with open(path, 'r') as f:
        return f.readlines()


def getDataGeneric(path, key="", **kwargs):
    """
    Loads data from a file with format mat, h5, txt or wav

    The format is deduced from the file extension, regardless of the case
    (e.g. ``.txt`` and ``.TXT`` are both supported).

    :param path: path to the data file
    :type path: str
    :param key: key to access the data in case of mat or h5 file, for txt and
        wav file it is ignored
    :type key: str
    :param kwargs: keyword arguments of numpy.loadtxt (in case of txt file) or
        :func:`.ToolsAudio.getDataAudio` (in case of wav file), they are
        ignored in case of mat or h5 file

    :returns: data
    :rtype: numpy array

    :raises ValueError: if the format is not supported
    """

    # the raw extension is kept for the error message
    raw_ext = path.split('.')[-1]
    ext = raw_ext.lower()

    if ext == "mat":
        return getDataMat(path, key)

    if ext == "h5":
        return getDataH5(path, key)

    if ext == "txt":
        # disable warnings when loading the file
        from warnings import catch_warnings, simplefilter
        with catch_warnings():
            simplefilter("ignore")
            return np.loadtxt(path, **kwargs)

    if ext == "wav":
        # only the second output of getDataAudio is returned
        _, data, _ = getDataAudio(path, **kwargs)
        return data

    # ValueError is a subclass of Exception, so callers catching Exception
    # are not impacted
    raise ValueError("Data format not supported: %s" % raw_ext)


def getDataMat(path, key):
    """
    Loads data from a mat file

    The file is first opened with ``scipy.io.loadmat``. If this fails (e.g.
    MATLAB v7.3 file, which is stored as HDF5), then the file is read with
    :func:`.getDataH5`.

    :param path: path to the data file
    :type path: str
    :param key: key to access the data
    :type key: str

    :returns: data, where the dimensions of length 1 are removed
    :rtype: numpy array

    :raises KeyError: if the file is read by ``scipy.io.loadmat`` but does not
        contain ``key``
    """

    # only the opening of the file is protected, so that a missing key in a
    # file that is successfully read is reported as such, instead of
    # triggering a pointless attempt to open the file with h5py
    try:
        content = loadmat(path)

    except Exception:
        # loadmat cannot read the file => try opening with h5py
        data = getDataH5(path, key)

    else:
        data = content[key]

    return np.squeeze(data)


def getAttributeH5(path, key_path):
    """
    Gets an attribute in a h5 file

    :param path: path to the file
    :type path: str
    :param key_path: key path to the attribute in the file, where the last
        component is the name of the attribute and what comes before is the
        path to the item (group or dataset) holding the attribute, for example
        ``"group/dataset/attribute"``; if there is only one component, then the
        attribute is looked up at the root of the file
    :type key_path: str

    :returns: attribute
    """

    # separate the location of the item holding the attribute and the name of
    # the attribute
    item_path, attr_name = split(key_path)

    with File(path, 'r') as f:
        # empty location => attribute of the file root
        holder = f[item_path] if item_path != "" else f

        return holder.attrs[attr_name]


def getAttributeGeneric(path, key):
    """
    Gets an attribute in a mat or h5 file

    The format is deduced from the file extension, regardless of the case.

    :param path: path to the file
    :type path: str
    :param key: key to access the attribute in the file; in case of mat file
        it is passed to :func:`.getDataMat`, in case of h5 file it is
        passed to :func:`.getAttributeH5`
    :type key: str

    If the file is not mat or h5, it returns ``key``.

    :returns: attribute
    """

    ext = path.split('.')[-1].lower()

    if ext == "mat":
        return getDataMat(path, key)

    if ext == "h5":
        return getAttributeH5(path, key)

    # format without attribute => the key itself is returned
    return key


def recursiveReadH5(parent_item):
    """
    Recursive function to read data from a h5py file object while preserving
    nested architecture

    It reaches the last group level recursively.

    If the parent item is a H5 dataset, then the function returns the content
    of the dataset. Otherwise it returns a dictionary, where each key
    corresponds to the name of an attribute or of a child item of the parent
    item, and the value corresponds to the attribute value or to the child
    content (may it be a nested dictionary in case of H5 group or the dataset
    content in case of H5 dataset). The attributes of a H5 dataset are not
    retrieved, it only works for attributes of a H5 group.

    If an attribute and a child item of a group have the same name, then only
    the child item is kept in the dictionary.

    :param parent_item: h5py file object, H5 group or H5 dataset

    :returns: all data contained in ``parent_item``
    :rtype: dict or numpy array
    """

    # dataset => end of recursion, the whole content is read
    if isinstance(parent_item, Dataset):
        return parent_item[()]

    # otherwise the output is a dictionary, initialized with the attributes
    output = dict(parent_item.attrs.items())

    # loop on items of the next level (they are added after the attributes, so
    # they take precedence in case of identical name)
    for name, child in parent_item.items():
        output[name] = recursiveReadH5(child)

    return output


def getDataH5(path, root_path='/'):
    """
    Reads the whole content of a H5 file or a specific dataset/group

    It calls the recursive function :func:`.recursiveReadH5`.

    :param path: path to the file
    :type path: str
    :param root_path: path to the H5 group or H5 dataset where to start
        retrieving data, default ``'/'`` (file root)
    :type root_path: str

    :returns: three options:

    - (*dict*) -- in case ``root_path`` points to a H5 group, all data
      contained in the H5 group
    - (*numpy array*) -- in case ``root_path`` points to a H5 dataset
    - ``None`` -- in case ``root_path`` points to a location that is not in the
      file
    """

    # the data is read while the file is still open
    with File(path, 'r') as f:
        if root_path not in f:
            return None

        return recursiveReadH5(f[root_path])