Source code for bibench.datasets.io

####################################################################
###     ____  _ ____                  _                          ###
###    | __ )(_) __ )  ___ _ __   ___| |__                       ###
###    |  _ \| |  _ \ / _ \ '_ \ / __| '_ \                      ###
###    | |_) | | |_) |  __/ | | | (__| | | |                     ###
###    |____/|_|____/ \___|_| |_|\___|_| |_|                     ###
###                                                              ###
###--------------------------------------------------------------###
###                                                              ###
### This file is part of the BiBench package for biclustering    ###
### analysis.                                                    ###
###                                                              ###
### Copyright (c) 2011 by:                                       ###
###   * Kemal Eren,                                              ###
###   * Mehmet Deveci,                                           ###
###   * Umit V. Catalyurek                                       ###
###                                                              ###
###--------------------------------------------------------------###
###                                                              ###
### For license info, please see the README and LICENSE files    ###
### in the main directory.                                       ###
###                                                              ###
###--------------------------------------------------------------###

"""Utilities for reading and writing datasets for various algorithms."""

from __future__ import division
import numpy as np

def write_expression_data(data, filename, sep='\t', genes=None, conditions=None):
    """Writes a dataset in the following relatively standard format::

        Genes/Conditions  [col ID]  [col ID]  ...  [col ID]
        [row ID]          [value]   [value]   ...  [value]
        [row ID]          [value]   [value]   ...  [value]
        ...
        [row ID]          [value]   [value]   ...  [value]

    Args:
        * data: numpy.ndarray
        * filename: Output file name.
        * sep: Separating character, e.g. ' ' or ','.
        * genes: Optional list of row labels.
        * conditions: Optional list of column labels.

    """
    outfile = open(filename, 'w')
    nrows, ncols = data.shape
    if genes is None:
        genes = ['row{0}'.format(i) for i in range(nrows)]
    else:
        assert len(genes) == nrows
    if conditions is None:
        conditions = ['col{0}'.format(i) for i in range(ncols)]
    else:
        assert len(conditions) == ncols
    outfile.write("Genes/Conditions")
    outfile.write(sep)
    outfile.write(sep.join(conditions))
    for i, gene in enumerate(genes):
        outfile.write("\n")
        outfile.write(gene)
        outfile.write(sep)
        outfile.write(sep.join([str(j) for j in data[i]]))
    outfile.close()

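# Illustrative usage sketch (added for clarity; not part of the original
# module).  The file name and data below are hypothetical.
def _example_write_expression_data():
    data = np.random.rand(3, 4)
    # Column labels default to 'col0', 'col1', ... when 'conditions' is omitted.
    write_expression_data(data, 'example_expression.tsv',
                          genes=['geneA', 'geneB', 'geneC'])
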
class ExpressionArray(np.ndarray):
    """
    A numpy array with extra attributes 'genes', 'samples', and 'annotation'.

    Adapted from http://docs.scipy.org/doc/numpy/user/basics.subclassing.html

    """
    def __new__(cls, input_array, genes=None, samples=None, annotation=None):
        obj = np.asarray(input_array).view(cls)
        obj.genes = genes
        obj.samples = samples
        obj.annotation = annotation
        return obj

    def __array_finalize__(self, obj):
        if obj is None:
            return
        self.genes = getattr(obj, 'genes', None)
        self.samples = getattr(obj, 'samples', None)
        self.annotation = getattr(obj, 'annotation', None)

    def __reduce__(self):
        object_state = list(np.ndarray.__reduce__(self))
        subclass_state = (self.genes, self.samples, self.annotation)
        object_state[2] = (object_state[2], subclass_state)
        return tuple(object_state)

    def __setstate__(self, state):
        nd_state, own_state = state
        np.ndarray.__setstate__(self, nd_state)
        self.genes, self.samples, self.annotation = own_state

#TODO: refactor to use labelled arrays or numpy's DataArrays

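# Illustrative sketch (not part of the original module): the extra attributes
# ride along through views and pickling; note that slicing copies the labels
# unchanged rather than re-slicing them.  The values below are hypothetical.
def _example_expression_array():
    arr = ExpressionArray(np.zeros((2, 3)),
                          genes=['geneA', 'geneB'],
                          samples=['s0', 's1', 's2'],
                          annotation='toy data')
    sub = arr[:, :2]                  # still an ExpressionArray
    return sub.genes, sub.annotation  # labels carried over from 'arr'
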
def read_expression_data(filename, skip_header=1, strip_chars=None):
    """
    Read a tsv file with the same format written by write_expression_data().

    Args:
        * filename: Input file name.
        * skip_header: Number of header lines to skip.
        * strip_chars: Characters to strip from row and header labels.

    Returns:
        An instance of ExpressionArray.

    """
    genes = list(np.genfromtxt(filename, skip_header=skip_header,
                               usecols=[0], dtype=np.character))
    data = np.genfromtxt(filename, skip_header=skip_header)[:, 1:]
    headers = []
    if skip_header > 0:
        headers = open(filename).read().split('\n')[0:skip_header]
        headers = [line.split() for line in headers]
    if strip_chars is not None:
        genes = [g.strip(strip_chars) for g in genes]
        headers = [[h.strip(strip_chars) for h in header] for header in headers]
    if headers and len(headers[0]) == data.shape[1] + 1:
        headers = [h[1:] for h in headers]  # remove top-left header, if present
    return ExpressionArray(data, genes, headers)

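# Illustrative sketch (not part of the original module): reads back a file
# produced by write_expression_data().  The file name is hypothetical.
def _example_read_expression_data():
    arr = read_expression_data('example_expression.tsv')
    # arr.genes holds the row labels; arr.samples holds the parsed header line(s).
    return arr.shape, arr.genes
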
def write_bicoverlapper(bicluster_sets, filename, rownames=None, colnames=None):
    """
    Writes every bicluster in a list of lists of biclusters to a file in the
    format read by BicOverlapper.

    Args:
        * bicluster_sets: a list of lists of biclusters that will be written
          to the file.
        * filename: output file name.
        * rownames: optional list of row labels.
        * colnames: optional list of column labels.

    File format::

        [number_of_biclusters]
        bicluster set 1
        #rows bic1.1 #columns bic1.1
        row1 row2 ... rowN
        col1 col2 ... colN
        #rows bic1.2 #columns bic1.2
        row1 row2 ... rowN
        col1 col2 ... colN
        ...
        bicluster set 2
        #rows bic2.1 #columns bic2.1
        row1 row2 ... rowN
        col1 col2 ... colN
        #rows bic2.2 #columns bic2.2
        row1 row2 ... rowN
        col1 col2 ... colN
        ...

    """
    data = bicluster_sets[0][0].data
    for biclusters in bicluster_sets:
        for b in biclusters:
            if not id(b.data) == id(data):
                raise Exception('Bicluster data is not identical.')
    if rownames is None:
        rownames = ['row{0}'.format(i) for i in range(data.shape[0])]
    if colnames is None:
        colnames = ['col{0}'.format(j) for j in range(data.shape[1])]
    outfile = open(filename, 'w')
    outfile.write("{0}\n".format(sum([len(b) for b in bicluster_sets])))
    for i, biclusters in enumerate(bicluster_sets):
        outfile.write("bicluster_set_{0}\n".format(i))
        for j, bicluster in enumerate(biclusters):
            outfile.write("#rows_bic{0}.{1} #columns_bic{0}.{1}\n".format(i, j))
            outfile.write(" ".join([rownames[r] for r in bicluster.rows]))
            outfile.write("\n")
            outfile.write(" ".join([colnames[c] for c in bicluster.cols]))
            outfile.write("\n")
    outfile.close()

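# Illustrative sketch (not part of the original module).  The '_Bicluster'
# stand-in below is hypothetical; in BiBench the real bicluster objects are
# passed instead.  The writer only relies on each bicluster exposing 'rows',
# 'cols', and 'data', with all biclusters sharing the same data matrix.
def _example_write_bicoverlapper():
    class _Bicluster(object):
        def __init__(self, rows, cols, data):
            self.rows, self.cols, self.data = rows, cols, data

    data = np.random.rand(4, 4)
    set1 = [_Bicluster([0, 1], [2, 3], data)]
    set2 = [_Bicluster([1, 2], [0, 1], data)]
    write_bicoverlapper([set1, set2], 'example_overlapper.txt')
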
def write_pcl_dataset(data, filename):
    """
    Given a pure numpy data matrix with only expression values, converts the
    data matrix into PCL format with default row and column names.

    Args:
        * data: numpy.ndarray
        * filename: output file name.

    """
    nrows, ncols = data.shape
    line1 = ["ID", "NAME", "GWEIGHT"]
    line1.extend(range(ncols))
    line2 = ["EWEIGHT", "", ""]
    line2.extend([1] * ncols)
    pcllist = [line1, line2]
    for i, line in enumerate(data):
        pclline = [i, '', 1]
        pclline.extend(line)
        pcllist.append(pclline)
    pcl = '\n'.join(['\t'.join(map(str, line)) for line in pcllist])
    with open(filename, 'w') as f:
        f.write(pcl)

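# Illustrative sketch (not part of the original module): converts a plain
# expression matrix to PCL with generated IDs and unit weights.  The file
# name and data are hypothetical.
def _example_write_pcl_dataset():
    data = np.random.rand(5, 3)
    write_pcl_dataset(data, 'example.pcl')
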
def write_david_multilist(filename, gene_lists, name=None):
    """
    Writes a DAVID multilist, with each list in one column.

    The first row gives the name of each list, which is just 'name#'.
    The gene names must be the same for each bicluster.

    """
    if name is None:
        name = "unnamed_list"
    longest = max([len(i) for i in gene_lists])

    def extend(gene_list, length):
        gl = gene_list[:]
        gl.extend(['' for i in range(length - len(gl))])
        return gl

    gene_lists = [extend(g, longest) for g in gene_lists]
    lines = [[gene_lists[i][j] for i in range(len(gene_lists))]
             for j in range(longest)]
    with open(filename, 'w') as f:
        f.write('\t'.join([name + str(i) for i in range(len(gene_lists))]))
        f.write('\n')
        for line in lines:
            f.write('\t'.join(line))
            f.write('\n')

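# Illustrative sketch (not part of the original module): gene lists of
# different lengths are padded with empty strings so that each list occupies
# one column.  The gene symbols and file name are hypothetical.
def _example_write_david_multilist():
    lists = [['TP53', 'BRCA1', 'EGFR'], ['MYC']]
    # Produces columns headed 'bicluster0' and 'bicluster1'.
    write_david_multilist('example_multilist.txt', lists, name='bicluster')
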
def write_david_list(filename, gene_list):
    """
    Write a DAVID (http://david.abcc.ncifcrf.gov/) list of genes.

    """
    with open(filename, 'w') as f:
        f.write('\n'.join(gene_list))

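# Illustrative sketch (not part of the original module): one gene symbol per
# line, ready to upload to DAVID.  The gene symbols and file name are
# hypothetical.
def _example_write_david_list():
    write_david_list('example_genes.txt', ['TP53', 'BRCA1', 'EGFR'])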