HPC Lab - Software - BiBench

Source code for bibench.validation.external

####################################################################
###     ____  _ ____                  _                          ###
###    | __ )(_) __ )  ___ _ __   ___| |__                       ###
###    |  _ \| |  _ \ / _ \ '_ \ / __| '_ \                      ###
###    | |_) | | |_) |  __/ | | | (__| | | |                     ###
###    |____/|_|____/ \___|_| |_|\___|_| |_|                     ###
###                                                              ###
###--------------------------------------------------------------###
###                                                              ###
### This file is part of the BiBench package for biclustering    ###
### analysis.                                                    ###
###                                                              ###
### Copyright (c) 2011 by:                                       ###
###   * Kemal Eren,                                              ###
###   * Mehmet Deveci,                                           ###
###   * Umit V. Catalyurek                                       ###
###                                                              ###
###--------------------------------------------------------------###
###                                                              ###
### For license info, please see the README and LICENSE files    ###
### in the main directory.                                       ###
###                                                              ###
###--------------------------------------------------------------###

"""
Metrics for evaluating biclusters and sets of biclusters.

"""

from __future__ import division

from collections import namedtuple

import numpy as np
import math

[docs]class ExternalError(Exception): pass
[docs]class EmptyList(ExternalError): pass
ListScore = namedtuple('ListScore', 'relevance recovery') ############################################################## #utilities for union/intersection/difference of areas # #note: different from area of union/intersection/difference ############################################################## def _union_area_(bic1, bic2): """ Returns the area of the element-wise union of biclusters. Note: this is different from bic1.union(bic2).area() Args: * bic1: Bicluster instance * bic2: Bicluster instance """ area1 = bic1.area() area2 = bic2.area() intersection_area = bic1.intersection(bic2).area() return area1 + area2 - intersection_area def _intersection_area_(bic1, bic2): """ Returns the area of the element-wise intersection of biclusters. Args: * bic1: Bicluster instance * bic2: Bicluster instance """ return bic1.intersection(bic2).area() def _difference_area_(bic1, bic2): """ Returns the element-wise area of bic1 - bic2 Note: this is different from bic1.difference(bic2).area() Args: * bic1: Bicluster instance * bic2: Bicluster instance """ return bic1.area() - bic1.intersection(bic2).area() def _area_is_nonzero_(bicluster): """ Raise an exception if bicluster's area is zero, because we need to divide by it. Args: * bicluster: Bicluster instance """ if bicluster.area() == 0: nrows, ncols = len(bicluster.rows), len(bicluster.cols) msg = 'bicluster is ill defined:' \ '{0} rows and {1} columns'.format(nrows, ncols) raise ExternalError(msg) ################################################ ### code for comparing individual biclusters ### ################################################
[docs]def recovery(expected, found): r""" The proportion of true positives found. .. math:: \frac{|e \cap f|}{|e|} Args: * expected: Bicluster. * found: Bicluster. """ _area_is_nonzero_(expected) return _intersection_area_(expected, found) / expected.area()
[docs]def relevance(expected, found, nelts=None): r""" The proportion of true negatives to all negatives. .. math:: \frac{n - |e \cup f|}{n - |e|} Args: * expected: Bicluster * found: Bicluster * nelts: the number of elements in the data matrix. This parameter can be ommitted if ``expected`` and ``found`` have their .data attribute set. """ if nelts is None: assert expected.data is not None #because we need to know negatives assert type(expected.data) == np.ndarray nelts = expected.data.size true_negs = nelts - _union_area_(expected, found) all_negs = nelts - expected.area() return true_negs / all_negs
[docs]def modified_relevance(expected, found): r""" The proportion of 'expected' retrieved in 'found'. Note that the one-way relevance is not true relevance: it is the intersection divided by the size of 'found'. One way relevance: .. math:: \frac{|e \cap f|}{|f|} Args: * expected: Bicluster * found: Bicluster """ _area_is_nonzero_(found) return _intersection_area_(expected, found) / found.area()
[docs]def f_measure(expected, found, beta=1, modified=True): r""" Calculates the f_measure score of a result of an algorithm with the expected biclusters that are embedded to the dataset. Uses relevance and recovery scores, and takes the harmonic mean of them. In the harmonic mean formula, relevance score is scaled with the square of beta as in the following formula: .. math:: (1 + \beta^2) * (spec * sens) / (\beta^2 * spec + sens) Args: * expected: Bicluster. * found: Bicluster. * beta: scale factor in the formula of f_measure. """ if modified: spec = modified_relevance(expected, found) else: spec = relevance(expected, found) sens = recovery(expected, found) if spec == sens == 0: return 0 return (1 + beta**2) * (spec * sens) / (beta**2 * spec + sens)
[docs]def row_jaccard(expected, found): """ The Jaccard coefficient of rows only. As used by Prelic - intersection_of_rows / union_of_rows Args: * expected: Bicluster * found: Bicluster """ intsn = expected.intersection(found) union = expected.union(found) if len(union.rows) == 0: return 0 return len(intsn.rows) / len(union.rows)
[docs]def jaccard(expected, found): r""" Jaccard coefficient of bicluster area. .. math:: \frac{|e \cap f|}{|e \cup f|} Args: * expected: Bicluster * found: Bicluster """ return _intersection_area_(expected, found) / _union_area_(expected, found) ############################################# ### code for comparing lists of biclusters ### #############################################
def _check_list_(blist): """ Checks if any of the expected or found bicluster lists are empty. Args: * blist: List of biclusters. """ if len(blist) == 0: raise EmptyList('list of biclusters is empty') def _asym_scores_(expected, found, rel_f, rec_f, comp=max): r""" Calculates both the relevance and the recovery scores of the algorithm result. Each bicluster score is computed according to the measure function. The overall score is computed as the mean of the best scores for each bicluster. .. math:: Recovery = \sum_{e \in expected} max_{f \in found} rec\_f(e, f) Relevance = \sum_{f \in found} max_{e \in expected} rec\_f(e, f) Args: * expected: List of target biclusters. * found: List of recovered biclusters. * rel_f: Function for calculating relevance. * rec_f: Function for calculating recovery. * comp: Either max() or min(), depending on whether rel_f and rec_f should be maximised or minimized. Returns: (recovery, relevance) tuple. """ _check_list_(expected) _check_list_(found) return ListScore(recovery=np.mean([comp([rec_f(e, f) for f in found]) for e in expected]), relevance=np.mean([comp([rel_f(e, f) for e in expected]) for f in found])) def _sym_scores_(expected, found, f, comp=max): """ Same as _asym_scores_, but f is symmetric, and so the order of the bicluster arguments does not matter. Args: * f: a function taking two biclusters and returning a score """ return _asym_scores_(expected, found, f, f)
[docs]def prelic_list(expected, found): """ Calculates both the relevance and the recovery scores of the algorithm result according to Prelics row jaccard score. Args: * expected: list of target biclusters * found: list of biclusters for comparison """ f = row_jaccard return _sym_scores_(expected, found, f)
[docs]def f_measure_list(expected, found, beta=1, modified=True): r""" Calculates both the relevance and the recovery scores of the algorithm result according to f_measure score with the given beta scale. .. math:: Recovery = \sum_{e \in expected} max_{f \in found} fmeasure(e, f) Relevance = \sum_{f \in found} max_{e \in expected} fmeasure(e, f) Args: * expected: list of target biclusters * found: list of biclusters for comparison * beta: scale factor in the formula of f_measure. """ f = lambda x, y: f_measure(x, y, beta=beta, modified=modified) return _sym_scores_(expected, found, f)
[docs]def jaccard_list(expected, found): r""" Recovery and relevance scores of a set of biclusters using Jaccard coefficient. .. math:: Recovery = \sum_{e \in expected} max_{f \in found} \frac{|e \cap f|}{|e \cup f|} Relevance = \sum_{f \in found} max_{e \in expected} \frac{|e \cap f|}{|e \cup f|} Args: * expected: list of target biclusters * found: list of biclusters for comparison """ f = jaccard return _sym_scores_(expected, found, f)
[docs]def recovery_relevance_list(expected, found, modified=True): r""" N.B.: The f-measure calculated as the harmonic mean of the recovery and relevance scores returned by this function is *different* from the f_measure_list function. .. math:: Recovery = \sum_{e \in expected} max_{f \in found} \frac{|e \cap f|}{|e|} Relevance = \sum_{f \in found} max_{e \in expected} \frac{|e \cap f|}{|f|} Args: * expected: list of target biclusters * found: list of biclusters for comparison """ return _asym_scores_(expected, found, modified_relevance, recovery)
[docs]def ecludian_distance(vector1, vector2): """ Returns the euclidian distance of the two vectors. Raises if the lengths of the vectors are not equal. Args: * vector1: 1-Dimensional list (vector) * vector1: 1-Dimensional list (vector) """ l1 = len(vector1) l2 = len(vector2) if (l1 != l2): print "Given vectors to ecludian_distance function does not have equal length." raise elif l1 == 0: print "Given vectors to ecludian_distance function have 0 length." raise else: d = 0 l = len(vector1) for i in range(l): f = vector1[i] - vector2[i] d += f * f return math.pow(d, 1/float(l1))
[docs]def calculate_distance_matrix(datamatrix, distance_function): """ Returns a matrix, Prows, that contains the information about proximity between rows. Args: * datamatrix: 2-Dimensional np array representing dataset. * distance_function: Function that calculates the distance between 2 vectors. For example ecludian_distance function. """ l = datamatrix.shape[0] Prows = np.zeros([l,l]) for ri in range(l): for rj in range(ri+1, l): d = distance_function(datamatrix[ri], datamatrix[rj]) Prows[ri][rj] = d Prows[rj][ri] = d Prows[ri][ri] = 0 return Prows
[docs]def get_distance_matrices(datamatrix, distance_function): """ Returns two matrices, Prows and Pcols, that contain the information about proximity between rows and columns respectively. Args: * datamatrix: 2-Dimensional np array representing dataset. * distance_function: Function that calculates the distance between 2 vectors. For example ecludian_distance function. """ Prows = calculate_distance_matrix(datamatrix, distance_function) Pcols = calculate_distance_matrix(datamatrix.transpose(), distance_function) return Prows, Pcols
[docs]def row_pair_count(i, j, bicluster_set): """ Returns the count of biclusters where the row_i and row_j appear together Args: * i: First row index, starting from 0. * j: Second row index, starting from 0. * bicluster_set: List of bicluster instaces. The list of found biclusters in the datamatrix. """ cnt = 0 for bic in bicluster_set: if (i in bic.rows) and (j in bic.rows): cnt += 1 return cnt
[docs]def col_pair_count(i, j, bicluster_set): """ Returns the count of biclusters where the col_i and col_j appear together Args: * i: First col index, starting from 0. * j: Second col index, starting from 0. * bicluster_set: List of bicluster instaces. The list of found biclusters in the datamatrix. """ cnt = 0 for bic in bicluster_set: if (i in bic.cols) and (j in bic.cols): cnt += 1 return cnt
[docs]def calculate_bicluster_group_matrix(datamatrix, bicluster_set): """ Returns two matrices,Crow and Ccol, that contain the information about how many times two pair of rows (and columns) are clustered together in the biclustering result. That is C_ij = 1/ (1+k), where k is the number of biclusters that row_i and row_j (col_i and col_j) appear together. Args: * datamatrix: 2-Dimensional np array representing dataset. * bicluster_set: List of bicluster instaces. The list of found biclusters in the datamatrix. """ l = datamatrix.shape[0] Crow = np.zeros([l,l]) for ri in range(l): for rj in range(ri+1, l): k = row_pair_count(ri, rj, bicluster_set) res = 1 / (1.0 + k) Crow[ri][rj] = res Crow[rj][ri] = res Crow[ri][ri] = 0 colM = datamatrix.transpose() l = colM.shape[0] Ccol = np.zeros([l,l]) for ri in range(l): for rj in range(ri+1, l): k = col_pair_count(ri, rj, bicluster_set) res = 1 / (1.0 + k) Ccol[ri][rj] = res Ccol[rj][ri] = res Ccol[ri][ri] = 0 return Crow, Ccol
[docs]def get_upperHalf(C): """ Returns a single dimensional array containing the values at upper half of the matrix C. Args: * C: 2-Dimensional square np array. Simply result of calculate_bicluster_group_matrix function. """ res = [] l = C.shape[0] for ri in range(l): for rj in range(ri+1, l): res.append( C[ri][rj]) return res
[docs]def hubert_statistics(P,C): """ Returns a score that is calculated using internal indices. Uses hubert_statistics explained in R. Santamaria et. al 2007. Args: * P: 2-Dimensional squared np array. A result of get_distance_matrices function. * C: 2-Dimensional square np array. A result of calculate_bicluster_group_matrix function. """ #import pdb; pdb.set_trace() l = P.shape[0] Pupper = get_upperHalf(P) Cupper = get_upperHalf(C) Mp = np.mean(Pupper) Mc = np.mean(Cupper) squared_variance_p = math.sqrt(np.var(Pupper)) squared_variance_c = math.sqrt(np.var(Cupper)) score = 0 for ri in range(l): for rj in range(ri+1, l): score += (P[ri][rj] - Mp) * (C[ri][rj] - Mc) #import pdb; pdb.set_trace() if score == 0: return 0 score /= l * (l-1) / 2.0 score /= (squared_variance_p * squared_variance_c) return score
[docs]def internal_index_score(datamatrix, bicluster_set): """ Returns the weighted hubert_statistics scores of 2 scores calculated for columns and rows.(R. Santamaria et. al 2007 section 3.3) Args: * datamatrix: 2-Dimensional np array representing dataset. * bicluster_set: List of bicluster instaces. The list of found biclusters in the datamatrix. """ #import pdb; pdb.set_trace() Prows, Pcols = get_distance_matrices(datamatrix,ecludian_distance) Crows, Ccols = calculate_bicluster_group_matrix(datamatrix, bicluster_set) row_score = hubert_statistics(Prows, Crows) col_score = hubert_statistics(Pcols, Ccols) row_cnt = datamatrix.shape[0] col_cnt = datamatrix.shape[1] internal_score = (row_score * row_cnt + col_score * col_cnt) / (row_cnt + col_cnt) return internal_score