# The implementation for "Network-based protein structural classification" paper
# Logistic Regression framework
# Author: Mahboobeh Ghalehnovi
# February 2020

import collections
from imblearn.over_sampling import SMOTE
from sklearn.metrics import matthews_corrcoef, precision_recall_fscore_support as score
from collections import Counter
from random import shuffle
from sklearn.linear_model import LogisticRegression
import random
import csv
import time
import os
import numpy as np
import warnings
warnings.filterwarnings("ignore")


def data_read(data_directory, file):
    """Read a whitespace-delimited file and return shuffled data and labels.

    Each non-blank line holds a label in the first column followed by
    numeric feature values. The number of columns is taken from the first
    non-blank line; extra trailing columns on later lines are ignored.
    Rows are shuffled with a fixed seed (2), so the returned order is
    deterministic for a given number of rows.

    Parameters
    ----------
    data_directory : str
        Directory of a dataset.
    file : str
        Name of a text file inside ``data_directory``.

    Returns
    -------
    X : float64
        A numpy matrix of data, one row per sample.
    Y : S32
        Labels (byte strings), aligned with the rows of ``X``.

    Raises
    ------
    ValueError
        If the file has no data rows, or a row has fewer columns than the
        first row.

    """

    # The context manager guarantees the handle is closed even on errors.
    with open(os.path.join(data_directory, file), 'r') as raw_data:
        rows = [line.split() for line in raw_data]

    # Blank lines (e.g. a trailing newline at the end of the file) carry no
    # sample and are skipped instead of crashing the parser.
    rows = [fields for fields in rows if fields]
    if not rows:
        raise ValueError("No data rows found in {0!r}".format(file))

    n = len(rows)
    m = len(rows[0])                         # m is the number of columns
    X = np.zeros([n, m - 1], dtype="float")
    Y = np.empty([n], dtype="S32")
    for i, fields in enumerate(rows):
        if len(fields) < m:
            raise ValueError(
                "Row {0} of {1!r} has {2} columns, expected {3}".format(
                    i, file, len(fields), m))
        Y[i] = fields[0]
        X[i, :] = [float(value) for value in fields[1:m]]

    # Deterministic shuffle so that every run sees the same sample order.
    ind_list = list(range(n))
    random.Random(2).shuffle(ind_list)
    X = X[ind_list, :]
    Y = Y[ind_list]
    return X, Y


def Sampling_SMOTE(dat_train, labs_train):
    """Oversample the training data with SMOTE (fixed random_state=2).

    Parameters
    ----------
    dat_train : float64
        Data (is a matrix)
    labs_train : S32
        Labels

    Returns
    -------
    dat_train_new : float64
        Data resampled with the SMOTE approach
    labs_train_new : S32
        Labels resampled with the SMOTE approach

    """

    sm = SMOTE(random_state=2)
    # imbalanced-learn renamed ``fit_sample`` to ``fit_resample`` and later
    # removed the old name; prefer the new one and fall back for old releases.
    resample = getattr(sm, 'fit_resample', None)
    if resample is None:
        resample = sm.fit_sample
    dat_train_new, labs_train_new = resample(dat_train, labs_train)
    return dat_train_new, labs_train_new


def partition(X, Y, NUM_FOLDS):
    """Split sample indices into NUM_FOLDS stratified test/train folds.

    For every class, ``count // NUM_FOLDS`` consecutive occurrences (in the
    order they appear in ``Y``) are assigned to each test fold. Samples left
    over after this integer division never appear in any test fold and are
    therefore part of every training fold.

    Parameters
    ----------
    X : float64
        Data (not inspected; kept for interface compatibility).
    Y : S32
        Labels, as a numpy array.
    NUM_FOLDS : int
        Number of folds.

    Returns
    -------
    test_indices : list of list of int
        Indices of test data, one list per fold (grouped by class, in
        sorted label order).
    train_indices : list of list of int
        Indices of train data, one ascending list per fold.
    keys : list
        Sorted and unique labels.

    """
    label_counts = Counter(Y)
    keys = sorted(label_counts.keys())
    # Positions of each class in Y and the per-fold share of each class.
    class_positions = [np.where(Y == key)[0] for key in keys]
    fold_sizes = [label_counts[key] // NUM_FOLDS for key in keys]
    n_samples = len(Y)

    test_indices = []
    train_indices = []

    for k in range(NUM_FOLDS):
        te_inds = [
            int(idx)
            for positions, size in zip(class_positions, fold_sizes)
            for idx in positions[k * size:(k + 1) * size]
        ]
        test_indices.append(te_inds)

        # Set membership keeps building the complement linear in len(Y).
        held_out = set(te_inds)
        tr_inds = [m for m in range(n_samples) if m not in held_out]
        train_indices.append(tr_inds)
    return test_indices, train_indices, keys


def _resample_training_data(data, labels, sampling_type):
    """Return the data unchanged if sampling_type == 0, else SMOTE-resampled."""
    if sampling_type == 0:
        return data, labels
    return Sampling_SMOTE(data, labels)


def _score_model(model, data, labels, metric):
    """Score a fitted model: accuracy for 'ACC', otherwise Matthews corrcoef."""
    if metric == 'ACC':
        return model.score(data, labels)
    return matthews_corrcoef(labels, model.predict(data))


def classifying_LR_l2(
        data_directory,
        file,
        sampling_type,
        metric,
        save_acc_perclass):
    """ Doing nested 10-fold classification using logistic regression

    The outer 10 folds give the reported scores. For each outer fold, the
    regularisation strength C is chosen from [0.25, 0.5, 1, 2, 4] by an inner
    10-fold cross-validation on the outer training data (highest mean inner
    score wins; ties keep the first value tried).

    Parameters
    ----------
    data_directory : str
        Directory of a dataset.
    file : str
        A text file.
    sampling_type : int
        Type of sampling method (0: no resampling, otherwise SMOTE is
        applied to the training data only)
    metric: str
        Metric for evaluation ('ACC' for accuracy; any other value is
        treated as Matthews correlation coefficient)
    save_acc_perclass: bool
        a flag for whether or not per-class accuracy is computed and returned

    Returns
    -------
    score_list_outer : numpy array
        Accuracies or Matthews correlation coefficients, one per outer fold
    acc_all_class_mean : numpy array
        Only if save_acc_perclass is True: per-class accuracy (in percent),
        averaged over the outer folds and rounded to 2 decimals
    keys_outer : list
        Only if save_acc_perclass is True: sorted unique labels, aligned
        with the per-class accuracies

    Raises
    ------
    ValueError
        If no value of C yields a comparable (non-NaN) mean inner score.

    """
    X, Y = data_read(data_directory, file)

    NUM_FOLDS = 10
    c = [0.25, 0.5, 1, 2, 4]
    test_indices_outer, train_indices_outer, keys_outer = partition(
        X, Y, NUM_FOLDS)

    score_list_outer = []
    acc_all_class_list = []

    for k in range(NUM_FOLDS):

        test_outer = X[test_indices_outer[k], :]
        test_outer_labs = Y[test_indices_outer[k]]
        train_outer = X[train_indices_outer[k], :]
        train_outer_labs = Y[train_indices_outer[k]]

        test_indices_iner, train_indices_iner, _ = partition(
            train_outer, train_outer_labs, NUM_FOLDS)

        # Start below every attainable score so that a C is always selected,
        # even when every inner accuracy is 0 (a start value of 0 with a
        # strict '>' would leave C unset in that case).
        c_optimal = None
        score_max = -np.inf

        for item in c:
            score_list_inner = []
            for n in range(NUM_FOLDS):
                dat_test = train_outer[test_indices_iner[n], :]
                labs_test = train_outer_labs[test_indices_iner[n]]
                dat_train = train_outer[train_indices_iner[n], :]
                labs_train = train_outer_labs[train_indices_iner[n]]

                # Resample the training part only; the held-out part stays
                # untouched.
                dat_train_new, labs_train_new = _resample_training_data(
                    dat_train, labs_train, sampling_type)

                model = LogisticRegression(C=item, solver='liblinear')
                model.fit(dat_train_new, labs_train_new)
                score_list_inner.append(
                    _score_model(model, dat_test, labs_test, metric))

            score_mean = np.mean(score_list_inner)
            if score_mean > score_max:
                score_max = score_mean
                c_optimal = item

        if c_optimal is None:
            raise ValueError(
                "Could not select C for outer fold {0}: all inner scores "
                "are NaN".format(k))

        train_outer_new, train_outer_labs_new = _resample_training_data(
            train_outer, train_outer_labs, sampling_type)

        model = LogisticRegression(C=c_optimal, solver='liblinear')
        model.fit(train_outer_new, train_outer_labs_new)

        score_list_outer.append(
            _score_model(model, test_outer, test_outer_labs, metric))

        if save_acc_perclass:
            y_pred_outer = model.predict(test_outer)
            acc_all_class = []
            for key in keys_outer:
                in_class = test_outer_labs == key
                # Fraction of this class's test samples predicted correctly;
                # the mean of an empty selection is NaN (class absent from
                # this test fold).
                hit_rate = np.mean(
                    y_pred_outer[in_class] == test_outer_labs[in_class])
                acc_all_class.append(np.round(hit_rate * 100, decimals=2))
            acc_all_class_list.append(np.array(acc_all_class, dtype=float))

    score_list_outer = np.array(score_list_outer, dtype=float)
    if save_acc_perclass:
        return score_list_outer, np.round(
            np.mean(np.array(acc_all_class_list), axis=0), decimals=2), keys_outer
    else:
        return score_list_outer


def logistic_regression(dataset, sampling_type, metric, save_acc_perclass):
    """ main function for doing classification

    Runs ``classifying_LR_l2`` on every ``.txt`` file found in
    ``<cwd>/data/<dataset>`` (processed in sorted name order) and writes a
    summary CSV to ``<cwd>/Results/<metric>_PRS/<dataset>.csv`` when
    ``sampling_type == 0`` or ``<cwd>/Results/<metric>_SMS/<dataset>.csv``
    otherwise. The summary file is overwritten on each call.

    When ``save_acc_perclass`` is True, one row of per-class accuracies per
    input file is additionally *appended* to
    ``<cwd>/Accuracy_perclass/<dataset>_accperclass_{PRS,SMS}.csv``; a header
    row is written before the first file of each call.

    Parameters
    ----------
    dataset : str
        Name of dataset
    sampling_type : int
        Type of sampling method
    metric: str
        Metric for evaluation (Accuracy or Matthews correlation coefficient)
    save_acc_perclass: bool
        a flag for whether or not this function saves accuracy per class

    """

    ROOT_PATH = os.getcwd()
    data_directory = os.path.join(ROOT_PATH, "data/", dataset)
    # os.listdir order is arbitrary; sort so output rows are reproducible.
    files = sorted(f for f in os.listdir(data_directory) if f.endswith(".txt"))
    sampling_tag = 'PRS' if sampling_type == 0 else 'SMS'
    dict_results = {}

    if save_acc_perclass:
        perclass_dir = os.path.join(ROOT_PATH, 'Accuracy_perclass')
        os.makedirs(perclass_dir, exist_ok=True)
        perclass_path = os.path.join(
            perclass_dir, dataset + '_accperclass_' + sampling_tag + '.csv')

    for k, fname in enumerate(files):
        start = time.time()
        outcome = classifying_LR_l2(
            data_directory, fname, sampling_type, metric, save_acc_perclass)
        end = time.time()
        elapsed = ((end - start) / 60)  # in minute

        if save_acc_perclass:
            results, Acc_per_class, keys = outcome
        else:
            results = outcome
        dict_results[fname] = [results.mean(), results.std(), elapsed]

        if save_acc_perclass:
            # np.append converts every cell to one string array, so byte
            # labels and float accuracies are written as plain text.
            fieldnames = np.append(['Feature'], keys)
            Acc_Feat = np.append(fname[:-4], Acc_per_class)
            # newline='' is required by the csv module to avoid spurious
            # blank lines on platforms that translate line endings.
            with open(perclass_path, 'a', newline='') as csvfile:
                writer = csv.writer(csvfile)
                if k == 0:
                    writer.writerow(fieldnames)
                writer.writerow(Acc_Feat)

    name = dataset + '.csv'
    namedir = metric + '_' + sampling_tag
    Directory_Save = os.path.join(ROOT_PATH, "Results/", namedir)
    os.makedirs(Directory_Save, exist_ok=True)

    if metric == 'ACC':
        fieldnames = [
            'feature',
            'accuracy_mean',
            'accuracy_std',
            'elapsed_time']
        scale = 100     # accuracies are reported in percent
    else:  # metric == 'MCC'
        fieldnames = [
            'feature',
            'matthewscor_mean',
            'matthewscor_std',
            'elapsed_time']
        scale = 1       # MCC is reported on its native [-1, 1] scale

    with open(os.path.join(Directory_Save, name), 'w', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for key, (mean, std, elapsed) in dict_results.items():
            writer.writerow({fieldnames[0]: key[:-4],
                             fieldnames[1]: "{0:.2f}".format(mean * scale),
                             fieldnames[2]: "{0:.2f}".format(std * scale),
                             fieldnames[3]: "{0:.4f}".format(elapsed)})
