Commit b4b1bc14 authored by Imanol Perez
Browse files

Upload project

parent f292dbfe
# Predicting Mortality of ICU Patients: the PhysioNet/Computing in Cardiology Challenge 2012
Website of the challenge: https://physionet.org/challenge/2012/#challenge-scoring
import numpy as np
def split(X, Y, proportion=0.75):
    '''
    Cuts the dataset into a leading training part and a trailing testing part.

    The first int(len(X) * proportion) items of X and of Y form the training
    set and the remaining items form the testing set. Nothing is shuffled.

    Returns the tuple (X_train, Y_train, X_test, Y_test).
    '''
    print("Splitting the dataset...")
    total = len(X)
    cut = int(total * proportion)
    n_train, n_test = cut, total - cut
    print("Dataset split in a training set of %s and testing set of %s patients."%(n_train, n_test))
    head, tail = slice(None, cut), slice(cut, None)
    return X[head], Y[head], X[tail], Y[tail]
File added
import urllib
import pandas as pd
import numpy as np
import zipfile
from os import listdir
from os.path import isfile, join
def get_inputs():
    '''
    Downloads the input data and loads it into memory.

    The archive is fetched from physionet.org into "data/input.zip" and
    extracted under "data/". Every regular file found in "data/set-a" is
    then parsed with pandas.read_csv.

    Returns a dict mapping a patient id (int) to the DataFrame read from
    that patient's file. The id is taken from row 0, column 2 of the file
    (presumably the RecordID entry -- verify against the data format).
    '''
    # Function-scope imports keep this block self-contained.
    # urlretrieve lives in urllib.request on Python 3 and in urllib on
    # Python 2; urllib.URLopener is not available from Python 3's urllib.
    import os
    try:
        from urllib.request import urlretrieve
    except ImportError:
        from urllib import urlretrieve

    print("Downloading input data from physionet.org...")
    url = "https://physionet.org/challenge/2012/set-a.zip"
    # The download target must exist before anything can be written to it.
    if not os.path.isdir("data"):
        os.makedirs("data")
    urlretrieve(url, "data/input.zip")

    print("Extracting input data...")
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile("data/input.zip", 'r') as archive:
        archive.extractall("data/")

    records_dir = "data/set-a"
    # Sorted so the load order does not depend on the order in which the
    # filesystem happens to list the files.
    file_names = sorted(name for name in listdir(records_dir)
                        if isfile(join(records_dir, name)))

    data = {}
    for name in file_names:
        df = pd.read_csv(join(records_dir, name))
        patient_id = int(df.values[0, 2])
        data[patient_id] = df
    return data
def get_outputs():
    '''
    Fetches the outcome table and maps every patient to a value.

    The table is read with pandas.read_csv directly from physionet.org.
    For each row, the first column (cast to int) is used as the patient id
    and the last column as that patient's value.

    Returns a dict {patient_id: value of the last column}.
    '''
    print("Downloading output data from physionet.org...")
    outcomes_url = "https://physionet.org/challenge/2012/Outcomes-a.txt"
    outcomes = pd.read_csv(outcomes_url)
    print("Extracting output data...")
    return {int(row[0]): row[-1] for row in outcomes.values}
def download():
    '''
    Downloads inputs and outputs and pairs them up patient by patient.

    Returns (X, Y): two lists of equal length where X[i] is the input
    record and Y[i] the output value of the same patient id. Patients
    appear in the iteration order of the dict returned by get_inputs().
    A patient id without an output raises KeyError.
    '''
    inputs_by_id, outputs_by_id = get_inputs(), get_outputs()
    patient_ids = list(inputs_by_id)
    X = [inputs_by_id[patient_id] for patient_id in patient_ids]
    Y = [outputs_by_id[patient_id] for patient_id in patient_ids]
    print("Data for %s patients downloaded."%len(X))
    return X, Y
File added
import numpy as np
def features_point(x):
    '''
    Builds the hand-crafted feature vector of a single datapoint.

    x is a pair (static, path). The returned array concatenates, in this
    order: the static values, the column-wise maximum of the path, the
    column-wise minimum of the path, and the last row of the path.
    '''
    static, path = x
    pieces = [
        static,
        np.max(path, axis=0),   # largest value seen in each column
        np.min(path, axis=0),   # smallest value seen in each column
        path[-1],               # most recent observation
    ]
    return np.concatenate(pieces)
def extract(X):
    '''
    Extracts hand-crafted features from every datapoint of the dataset.

    Returns a list holding features_point(x) for each x in X, in the same
    order as X.
    '''
    print("Extracting features...")
    return [features_point(datapoint) for datapoint in X]
File added
import download
import reformat
import normalise
import features
import model
import performance
import dataset
if __name__ == "__main__":
    # End-to-end pipeline: download -> reformat -> normalise -> extract
    # features -> split -> train -> evaluate.

    # We begin by downloading the data. The data will be in the form of
    # "events" data: each datapoint for each patient will be a recorded event.
    X, Y = download.download()

    # The event data is reformatted. This is done by selecting the given
    # variables and transforming time-dependent events to a path.
    X = reformat.reformat(X, static_variables=["Age", "Gender"],
                          dynamic_variables=["Creatinine", "Glucose"])

    # Now, we normalise the data.
    X = normalise.normalise(X)

    # We extract features from the input data. The result gets its own name
    # so that the imported `features` module is not rebound to a list.
    feature_vectors = features.extract(X)

    # The dataset is now split into a training and testing set.
    features_train, Y_train, features_test, Y_test = dataset.split(
        feature_vectors, Y, proportion=0.75)

    # We now train the model with the selected features.
    classifier = model.train(features_train, Y_train)

    # We evaluate performance of the model now.
    performance.evaluate(classifier, features_test, Y_test)
from sklearn.ensemble import RandomForestClassifier
def train(features, Y):
    '''
    Fits a random forest classifier on the given features and labels.

    The classifier is created with RandomForestClassifier's default
    constructor arguments. Returns the fitted classifier.
    '''
    print("Training the model...")
    forest = RandomForestClassifier()
    forest.fit(features, Y)
    return forest
File added
import numpy as np
def normalise_point(x):
    '''
    We normalise each datapoint by dividing time by 2 (the number of
    minimum days that patients stayed in ICU).

    x is a pair (static, path), where column 0 of path holds the time.
    Returns the list [static, scaled_path]. static is passed through
    unchanged; scaled_path is a new float array.
    '''
    static, path = x
    # Work on a float copy: the caller's array is left untouched, and an
    # integer-typed path no longer fails on the in-place true division.
    scaled = np.array(path, dtype=float)
    scaled[:, 0] /= 2.
    return [static, scaled]
def normalise(X):
    '''
    Normalises every datapoint of the dataset.

    Returns a list holding normalise_point(x) for each x in X, in the same
    order as X.
    '''
    print("Normalising the data...")
    return [normalise_point(datapoint) for datapoint in X]
File added
import numpy as np
from sklearn.metrics import roc_auc_score, confusion_matrix
def evaluate(classifier, features, Y):
    '''
    Reports the performance of a fitted classifier on the given samples.

    The printed figure is roc_auc_score computed from Y and the second
    column of classifier.predict_proba(features) (presumably the
    probability of the positive class -- confirm the class order).
    Nothing is returned.
    '''
    print("Evaluating performance...")
    class_probabilities = classifier.predict_proba(features)
    positive_scores = class_probabilities[:, 1]
    auc = roc_auc_score(Y, positive_scores)
    print("ROC of predictions: %s"%auc)
    # Disabled draft of a min(Se, P) score, kept for reference.
    # NOTE(review): as written, the Se denominator adds cm[1, 0] to itself
    # and neither denominator includes cm[1, 1]; check both formulas before
    # re-enabling.
    #predictions = classifier.predict(features)
    #cm = confusion_matrix(Y, predictions)
    #Se = cm[1, 1] / float(cm[1, 0] + cm[1, 0])
    #P = cm[1, 1] / float(cm[1, 0] + cm[0, 1])
    #score = min(Se, P)
    #print("Score of predictions: %s"%score)
File added
import pandas as pd
import numpy as np
import urllib
from os import listdir
from os.path import isfile, join
import zipfile
import download
import copy
def to_path(df, dynamic_variables):
    '''
    Constructs a path from the given dynamic variables.

    Each row of df.values is read as (time, parameter, value), where time
    is a string of the form "HH:MM". The path starts with a row of zeros.
    Every event whose parameter is listed in dynamic_variables appends a
    copy of the previous row with the time, converted to days, in column 0
    and the event's value in column 1 + (position of the parameter in
    dynamic_variables).

    Returns a 2-D array with one row per distinct time, sorted by
    increasing time. When several rows share a time, the last one appended
    is kept.
    '''
    # Resolve each variable's column once instead of searching the list for
    # every event. setdefault keeps the first position of a repeated name,
    # which is what list.index would return.
    column_of = {}
    for position, name in enumerate(dynamic_variables):
        column_of.setdefault(name, position + 1)

    rows = [[0.] * (len(dynamic_variables) + 1)]
    for event in df.values:
        column = column_of.get(event[1])
        if column is None:
            continue
        # Rows are flat lists, so a shallow copy carries the previous
        # values forward.
        row = list(rows[-1])
        row[column] = event[2]
        hours, minutes = event[0].split(":")
        row[0] = (float(hours) + float(minutes) / 60.) / 24.
        rows.append(row)
    path = np.array(rows)

    # Now, for each time we only need one datapoint: remember the last row
    # seen for every time in a single pass.
    last_row_at = {}
    for row_number, time in enumerate(path[:, 0]):
        last_row_at[time] = row_number
    # np.unique returns the distinct times in ascending order.
    keep = [last_row_at[time] for time in np.unique(path[:, 0])]
    return path[keep]
def static_features(df, static_variables):
    '''
    Retrieves the values of the given static variables.

    Selects the rows of df whose "Parameter" entry is one of
    static_variables and returns their "Value" entries as an array. The
    values come out in the row order of df, not in the order of
    static_variables.
    '''
    wanted_rows = df["Parameter"].isin(static_variables)
    return df.loc[wanted_rows, "Value"].values
def reformat(X, static_variables, dynamic_variables):
    '''
    Reformats every patient history into a [static, dynamic] pair.

    For each element of X, the dynamic part is built by to_path and the
    static part by static_features. X is overwritten element by element
    and the same object is returned.
    '''
    print("Reformatting input data...")

    def as_static_and_path(history):
        # The path is built first, then the static values.
        path = to_path(history, dynamic_variables=dynamic_variables)
        static = static_features(history, static_variables=static_variables)
        return [static, path]

    for position, history in enumerate(X):
        X[position] = as_static_and_path(history)
    return X
File added
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment