libsvmdata/datasets.py (107 changes: 76 additions, 31 deletions)
@@ -1,6 +1,7 @@
# Author: Mathurin Massias <[email protected]>
# License: BSD 3 clause
import os
import tarfile
from pathlib import Path
from bz2 import BZ2Decompressor

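The new `tarfile` import backs the archive handling added further down. A minimal sketch of the stdlib calls this diff relies on (the path below is illustrative, not from the PR):

import tarfile
from pathlib import Path

archive = Path("binary/criteo.kaggle2014.svm.tar.xz")    # hypothetical local copy
if tarfile.is_tarfile(archive):             # True for .tar, .tar.gz, .tar.xz, ...
    with tarfile.open(archive, "r") as f:   # plain "r" handles the compression transparently
        f.extractall(archive.parent)        # unpack every member next to the archive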
@@ -50,6 +51,8 @@
'covtype.multiclass': 'multiclass/covtype.bz2',
'covtype.multiclass_scale': 'multiclass/covtype.scale01.bz2',
'cpusmall': 'regression/cpusmall',
'criteo': 'binary/criteo.kaggle2014.svm.tar.xz',
'criteo-test': 'binary/criteo.kaggle2014.svm.tar.xz',
'delicious': 'multilabel/delicious.bz2',
'diabetes': 'binary/diabetes',
'diabetes_scale': 'binary/diabetes_scale',
@@ -146,6 +149,13 @@
'yeast_test': 'multilabel/yeast_test.svm.bz2',
}

# Mapping from dataset name to the file inside the tar archive to load
# the data from, plus the list of other datasets that can be extracted too
TAR_FILE_MEMBERS = {
'criteo': ('binary/criteo.kaggle2014.svm/train.txt.svm', ['criteo-test']),
'criteo-test': ('binary/criteo.kaggle2014.svm/test.txt.svm', ['criteo'])
}
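How this mapping is meant to be read (an illustration, not part of the diff): the first element names the archive member holding the requested dataset, and the list names the sibling datasets unpacked from the same archive.

member, siblings = TAR_FILE_MEMBERS['criteo']
# member   -> 'binary/criteo.kaggle2014.svm/train.txt.svm'
# siblings -> ['criteo-test'], cached as a side effect of the same download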

N_FEATURES = {
'a1a': 123,
'a1a_test': 123,
@@ -185,6 +195,8 @@
'covtype.multiclass': 54,
'covtype.multiclass_scale': 54,
'cpusmall': 12,
'criteo': 1_000_000,
'criteo-test': 1_000_000,
'delicious': 500,
'diabetes': 8,
'diabetes_scale': 8,
@@ -315,20 +327,32 @@ def _get_X_y(dataset, multilabel, replace=False, verbose=False):
replace=True."""

# some files are compressed, some are not:
if NAMES[dataset].endswith('.bz2'):
stripped_name = NAMES[dataset][:-4]
else:
stripped_name = NAMES[dataset]
stripped_name = NAMES[dataset]
for ext in ['.bz2', '.tar.xz']:
if stripped_name.endswith(ext):
stripped_name = stripped_name[:-len(ext)]

ext = '.npz' if multilabel else '.npy'
y_path = DATA_HOME / f"{stripped_name}_target{ext}"
X_path = DATA_HOME / f"{stripped_name}_data" # no ext to handle npy or npz
if dataset in TAR_FILE_MEMBERS:
tmp_path, others = TAR_FILE_MEMBERS[dataset]
path_mapping = {
str(DATA_HOME / TAR_FILE_MEMBERS[d][0]): d for d in others
}
tmp_path = DATA_HOME / tmp_path
path_mapping[str(tmp_path)] = dataset
else:
tmp_path = DATA_HOME / stripped_name
path_mapping = {str(tmp_path): dataset}

y_path = tmp_path.parent / f"{tmp_path.name}_target{ext}"
# no ext for X to handle npy or npz
X_path = tmp_path.parent / f"{tmp_path.name}_data"
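# e.g. for 'criteo', tmp_path is DATA_HOME/binary/criteo.kaggle2014.svm/
# train.txt.svm, so the cache files land next to it as
# train.txt.svm_target.npy and train.txt.svm_data.npy (or .npz)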

if (replace or not y_path.exists()
or not ((X_path.parent / (X_path.name + '.npy')).exists() or
(X_path.parent / (X_path.name + '.npz')).exists())):
# above, do not use .with_suffix because of datasets like a1a.t, where
# the method would replace the .t by .npz
tmp_path = DATA_HOME / stripped_name

# Download the dataset
source_path = DATA_HOME / NAMES[dataset]
@@ -337,7 +361,14 @@ def _get_X_y(dataset, multilabel, replace=False, verbose=False):
download_libsvm(dataset, source_path, replace=replace, verbose=verbose)

# decompress file only if it is compressed
if NAMES[dataset].endswith('.bz2'):
if tarfile.is_tarfile(source_path):
if verbose:
print("Decompressing...")
with tarfile.open(source_path, "r") as f:
f.extractall(source_path.parent)
source_path.unlink()

elif NAMES[dataset].endswith('.bz2'):
decompressor = BZ2Decompressor()
if verbose:
print("Decompressing...")
@@ -349,30 +380,44 @@ def _get_X_y(dataset, multilabel, replace=False, verbose=False):
n_features_total = N_FEATURES[dataset]
if verbose:
print("Loading svmlight file...")
with open(tmp_path, 'rb') as f:
X, y = load_svmlight_file(
f, n_features=n_features_total, multilabel=multilabel)

tmp_path.unlink()
# if X's density is more than 0.5, store it in dense format:
if len(X.data) >= 0.5 * X.shape[0] * X.shape[1]:
X = X.toarray(order='F')
np.save(X_path, X)
else:
X = sparse.csc_matrix(X)
X.sort_indices()
sparse.save_npz(X_path, X)

if multilabel:
indices = np.array([lab for labels in y for lab in labels])
indptr = np.cumsum([0] + [len(labels) for labels in y])
data = np.ones_like(indices)
Y = sparse.csr_matrix((data, indices, indptr))
sparse.save_npz(y_path, Y)
return X, Y

else:
np.save(y_path, y)
# Loop over multiple files to cope with cases where the dataset comes
# in an archive. Store the result for every dataset in the archive and
# keep the one to return in X_y_.
X_y_ = None
for path, d in path_mapping.items():
with open(path, 'rb') as f:
X, y = load_svmlight_file(
f, n_features=n_features_total, multilabel=multilabel
)

# recompute the paths for each dataset, in case it comes from an archive
y_path = f"{path}_target{ext}"
X_path = f"{path}_data"

# if X's density is more than 0.5, store it in dense format:
if len(X.data) >= 0.5 * X.shape[0] * X.shape[1]:
X = X.toarray(order='F')
np.save(X_path, X)
else:
if not sparse.issparse(X):
X = sparse.csc_matrix(X)
X.sort_indices()
sparse.save_npz(X_path, X)

if multilabel:
indices = np.array([lab for labels in y for lab in labels])
indptr = np.cumsum([0] + [len(labels) for labels in y])
data = np.ones_like(indices)
y = sparse.csr_matrix((data, indices, indptr))
sparse.save_npz(y_path, y)
else:
np.save(y_path, y)

if d == dataset:
X_y_ = X, y

X, y = X_y_

else:
try:
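For reference, a toy run of the multilabel branch above, showing how the list of label tuples returned by `load_svmlight_file` becomes a binary CSR indicator matrix (the labels here are made up):

import numpy as np
from scipy import sparse

y = [(0, 2), (1,), (0, 1, 2)]  # per-sample labels, as load_svmlight_file returns them
indices = np.array([lab for labels in y for lab in labels])  # [0 2 1 0 1 2]
indptr = np.cumsum([0] + [len(labels) for labels in y])      # [0 2 3 6]
data = np.ones_like(indices)
Y = sparse.csr_matrix((data, indices, indptr))
# Y.toarray() gives the 3x3 indicator matrix
# [[1 0 1]
#  [0 1 0]
#  [1 1 1]]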