# Source code for yall.datasets.base

import os
import numpy as np
from collections import namedtuple


# TODO: Maybe just use sklearn.utils.Bunch
# Immutable record handed back by the dataset loaders in this module:
#   data      -- feature matrix
#   target    -- label vector
#   filenames -- paths of the files the arrays were read from
Bunch = namedtuple("Bunch", "data target filenames")


# TODO: Write unit tests for data loading functions
def load_dexter():
    """Load the Dexter dataset files shipped next to this module.

    Reads ``train.data`` / ``valid.data`` and ``train.labels`` /
    ``valid.labels`` from the ``data/dexter`` directory located beside this
    file. Every data line is a whitespace-separated list of ``index:value``
    tokens; the tokens are expanded into one dense integer row with 20000
    columns. Training rows come first, validation rows follow, and the
    labels are concatenated in the same order.

    A data line that holds no tokens becomes an all-zero row.

    Returns
    -------
    Bunch
        ``data`` is an int array of shape ``(n_lines, 20000)``, ``target``
        is an int array with one entry per label line, and ``filenames``
        lists the four paths in the order train data, valid data,
        train labels, valid labels.

    Raises
    ------
    OSError
        If one of the four files cannot be opened.
    ValueError
        If a label or an ``index:value`` token cannot be parsed as integers.
    IndexError
        If a feature index does not address one of the 20000 columns.
    """
    n_features = 20000  # Dimension of the features

    module_path = os.path.dirname(__file__)
    data_path = os.path.join(module_path, "data", "dexter")
    train_data_file = os.path.join(data_path, "train.data")
    valid_data_file = os.path.join(data_path, "valid.data")
    train_labels_file = os.path.join(data_path, "train.labels")
    valid_labels_file = os.path.join(data_path, "valid.labels")

    # One list of "index:value" tokens per line, train lines first.
    rows = []
    for path in (train_data_file, valid_data_file):
        with open(path, 'r') as in_f:
            rows.extend(line.split() for line in in_f)

    # One label string per line, in the same train-then-valid order.
    labels = []
    for path in (train_labels_file, valid_labels_file):
        with open(path, 'r') as in_f:
            labels.extend(line.strip() for line in in_f)
    target = np.array(labels, dtype=int)

    out_data = np.zeros(shape=(len(rows), n_features), dtype=int)
    for i, tokens in enumerate(rows):
        if not tokens:
            # No features on this line: keep the zero row. Building the pair
            # array from an empty list gives a 1-D array, and slicing it
            # with [:, 0] would raise IndexError.
            continue
        # Array of [index, frequency]
        pairs = np.array([token.split(':') for token in tokens], dtype=int)
        # NOTE(review): indices are used as-is as 0-based column positions;
        # confirm the files never contain an index equal to n_features.
        out_data[i, pairs[:, 0]] = pairs[:, 1]  # Assign values for row i

    return Bunch(data=out_data, target=target,
                 filenames=[train_data_file, valid_data_file,
                            train_labels_file, valid_labels_file])
def load_spect():
    """Load the ``SPECT.train`` and ``SPECT.test`` files as one dataset.

    Both files live in the ``data/SPECT`` directory beside this module and
    are parsed as comma-separated integers. In each file the first column
    is the target and the remaining columns are the features. Training
    rows are placed before test rows in the returned arrays.

    Returns
    -------
    Bunch
        ``data`` holds the stacked feature columns, ``target`` the
        concatenated first columns, and ``filenames`` the two paths in the
        order train, test.
    """
    base_dir = os.path.join(os.path.dirname(__file__), "data", "SPECT")
    train_path = os.path.join(base_dir, "SPECT.train")
    test_path = os.path.join(base_dir, "SPECT.test")

    train = np.loadtxt(train_path, delimiter=',', dtype=int)
    test = np.loadtxt(test_path, delimiter=',', dtype=int)

    # Column 0 is the target; everything to its right is a feature.
    features = np.vstack([train[:, 1:], test[:, 1:]])
    labels = np.concatenate([train[:, 0], test[:, 0]])

    return Bunch(data=features, target=labels,
                 filenames=[train_path, test_path])
def load_spectf():
    """Load the ``SPECTF.train`` and ``SPECTF.test`` files as one dataset.

    The files are read from the ``data/SPECT`` directory beside this module
    as comma-separated integers. The first column of each file is the
    target; the other columns are features. Rows from the training file
    precede rows from the test file.

    Returns
    -------
    Bunch
        ``data`` is the row-wise stack of the feature columns, ``target``
        the joined target columns, and ``filenames`` the two paths in the
        order train, test.
    """
    folder = os.path.join(os.path.dirname(__file__), "data", "SPECT")

    def read_table(path):
        # Both files share the same comma-separated integer layout.
        return np.loadtxt(path, delimiter=',', dtype=int)

    train_data_file = os.path.join(folder, "SPECTF.train")
    test_data_file = os.path.join(folder, "SPECTF.test")
    train_table = read_table(train_data_file)
    test_table = read_table(test_data_file)

    # Split off the leading target column from each table.
    train_X, train_y = train_table[:, 1:], train_table[:, 0]
    test_X, test_y = test_table[:, 1:], test_table[:, 0]

    return Bunch(
        data=np.vstack([train_X, test_X]),
        target=np.concatenate([train_y, test_y]),
        filenames=[train_data_file, test_data_file],
    )