Jul-17-2019, 12:03 PM
I am doing feature selection using python 2.7, the dataset was loaded fine but when i run the code it gives me this error
#!/usr/bin/env python
'''
An example file to show how to use the feature-selection code in ml_lib
'''
import os
import shutil
import json
from tempfile import mkdtemp
from tqdm import tqdm
from scipy.stats import randint as sp_randint
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve
from sklearn.externals import joblib
from sklearn.externals.joblib import Memory
from sklearn.metrics import matthews_corrcoef
import gplearn
#import gplearn.genetic
import gplearn.fitness
import numpy as np
from joblib import Parallel, delayed
from sklearn.externals.joblib import Parallel
from joblib import load, dump
from identity_transformer import IdentityTransformer
import depmeas
import pandas
from tqdm import tqdm
import pandas as pd
import csv
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import numpy as np
from multiprocessing import Process
import seaborn as sns
import matplotlib.pyplot as plt
import feature_select
import depmeas
if __name__ == '__main__':
def generic_combined_scorer(x1, o1, ii_1, x2, o2, ii_2, y, h):
s1 = h(x1, y)
s2 = h(x2, y)
o1[ii_1] = s1
o2[ii_2] = s2
NUM_CV = 3
RANDOM_SEED = 123
MAX_ITER = 1000
# leuk = fetch_mldata('iris', transpose_data=True)
X = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')
y = pd.read_csv(r'C:\Users\pc\Desktop\dataset\leukemia.csv')
# perform feature selection
num_features_to_select = 25
K_MAX = 1000
estimator = depmeas.mi_tau
n_jobs = -1
verbose = True
# print ( X.head(20))
num_dim = X.shape[1]
#print (num_dim)
if (num_features_to_select is not None):
num_selected_features = min(num_dim, num_features_to_select)
else:
num_selected_features = num_dim
K_MAX_internal = min(num_dim, K_MAX)
initial_scores = Parallel(n_jobs=n_jobs)(delayed(estimator)(X.iloc[:, 1], y.iloc[:, 1]) for ii in range(num_dim))
# rank the scores in descending order
sorted_scores_idxs = np.flipud(np.argsort(initial_scores))
# subset the data down so that joblib doesn't have to
# transport large matrices to its workers
X_subset = X.iloc[:, sorted_scores_idxs[0:K_MAX_internal]]
# memory map this for parallelization speed
tmp_folder = mkdtemp()
# TODO: why is X_subset crashing when we increase K_MAX_in? Investigate in detail, but
# for now, do not use memory mapping for X_subset for stability
# X_subset_fname = os.path.join(tmp_folder, 'X_subset')
# dump(X_subset, X_subset_fname)
# X_subset = load(X_subset_fname, mmap_mode='r')
selected_feature_idxs = np.zeros(num_selected_features, dtype=int)
remaining_candidate_idxs = range(1, K_MAX_internal)
# mi_matrix = np.empty((K_MAX_internal,num_selected_features-1))
# mi_matrix[:] = np.nan
relevance_vec_fname = os.path.join(tmp_folder, 'relevance_vec')
feature_redundance_vec_fname = os.path.join(tmp_folder, 'feature_redundance_vec')
mi_matrix_fname = os.path.join(tmp_folder, 'mi_matrix')
relevance_vec = np.memmap(relevance_vec_fname, dtype=float,
shape=(K_MAX_internal,), mode='w+')
feature_redundance_vec = np.memmap(feature_redundance_vec_fname, dtype=float,
shape=(K_MAX_internal,), mode='w+')
mi_matrix = np.memmap(mi_matrix_fname, dtype=float,
shape=(K_MAX_internal, num_selected_features - 1), mode='w+')
mi_matrix[:] = np.nan
# TODO: investigate whether its worth it to parallelize the nested for-loop?
with tqdm(total=num_selected_features, desc='Selecting Features ...', disable=(not verbose)) as pbar:
pbar.update(1)
for k in range(1, num_selected_features):
ncand = len(remaining_candidate_idxs)
last_selected_feature = k - 1
Parallel(n_jobs=n_jobs)(delayed(generic_combined_scorer)(y, relevance_vec, ii,
X_subset[:, selected_feature_idxs[last_selected_feature]],
feature_redundance_vec, ii, X_subset.iloc[:, ii],
estimator)for ii in remaining_candidate_idxs)
# copy the redundance into the mi_matrix, which accumulates our redundance as we compute
mi_matrix[remaining_candidate_idxs, last_selected_feature] = feature_redundance_vec[remaining_candidate_idxs]
redundance_vec = np.nanmean(mi_matrix[remaining_candidate_idxs, :], axis=1)
tmp_idx = np.argmax(relevance_vec[remaining_candidate_idxs] - redundance_vec)
selected_feature_idxs[k] = remaining_candidate_idxs[tmp_idx]
del remaining_candidate_idxs[tmp_idx]
pbar.update(1)
# map the selected features back to the original dimensions
selected_feature_idxs = sorted_scores_idxs[selected_feature_idxs]
print('Leukemia Dataset Feature Selection\n Total # Features=%d' % (X.shape[1]))
print('# Selected Features')
print('selected_feature_idxs')the Error is :Error:C:\Python2\python.exe C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py
Selecting Features ...: 4%|▍ | 1/25 [00:00<00:00, 500.04it/s]
Traceback (most recent call last):
File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <module>
estimator)for ii in remaining_candidate_idxs)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 754, in dispatch_one_batch
self._pickle_cache)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\sklearn\externals\joblib\parallel.py", line 210, in __init__
self.items = list(iterator_slice)
File "C:/Users/pc/PycharmProjects/MymrmrTest/mytestmRmR.py", line 143, in <genexpr>
estimator)for ii in remaining_candidate_idxs)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\frame.py", line 2927, in __getitem__
indexer = self.columns.get_loc(key)
File "C:\Users\pc\AppData\Roaming\Python\Python27\site-packages\pandas\core\indexes\base.py", line 2657, in get_loc
return self._engine.get_loc(key)
File "pandas\_libs\index.pyx", line 108, in pandas._libs.index.IndexEngine.get_loc
File "pandas\_libs\index.pyx", line 110, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key
Process finished with exit code 1
