Source code for sa.utilities

"""
############################
utilities (``sa.utilities``)
############################

This module reads CSV files with extracted data and provides helpers for
preprocessing them (standardization and outlier removal of wild-type strains)
and for saving the results in Orange and CSV formats.

Assumed file format:
    * sequence number
    * plate
    * date
    * row 
    * column 
    * ... (computational features -- columns whose values can be interpreted as real numbers)
    * ORF
        
"""
import numpy as np
import os

import methods

def read(dir_path=None, *files):
    """
    Read CSV files and return their contents, header included.

    If :param:`dir_path` is given, all files in that directory are read and
    :param:`files` is ignored; the path is expected to end with a path
    separator, since file names are appended to it directly.

    Return a tuple ``(meta, plates)``: ``meta[i]`` is ``(file_name, attr_names)``
    taken from the header line of the i-th file and ``plates[i]`` is the list
    of its data rows.
    """
    if dir_path is not None:
        files = os.listdir(dir_path)
    print "Reading data files from path: %s" % dir_path
    meta = []
    plates = []
    for fname in files:
        f = open(dir_path + fname, "r")
        names = f.readline().strip().split(",")
        data = [line.strip().split(",") for line in f]
        meta.append((fname, names))
        plates.append(data)
        f.close()
    print "Read files: %s" % "\n\t".join(files)
    return meta, plates
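
# Usage sketch for ``read`` (the directory name is hypothetical; note that it
# must end with a path separator because file names are concatenated to it):
#
#     meta, plates = read("data/del/")
#     fname, attr_names = meta[0]
#     first_row = plates[0][0]
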
def read_repl(file_path, keys=["RT", "37"]):
    """
    Read a CSV file of repeating mutants with attributes
    [ORF, plate, row, column], header included.

    :param file_path: Full file path to CSV file with information on replicates.
    :type file_path: `str`
    :param keys: Names of TS (temperature sensitive mutants) plates' extensions.
        By default, these are RT and 37.
    :type keys: `list`

    Return a tuple ``(names, data_k)`` where ``names`` holds the attribute
    names and ``data_k`` contains one entry per repeating mutant; TS plates
    are expanded into one entry per key.
    """
    f = open(file_path, "r")
    names = f.readline().strip().split(",")
    data = [line.strip().split(",") for line in f]
    data_k = []
    for (orf, pn, r, c) in data:
        if "TS" in pn:
            data_k.extend([(orf, pn + "-" + key, r, c) for key in keys])
        else:
            data_k.append((orf, pn, r, c))
    f.close()
    return names, data_k
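
# Usage sketch for ``read_repl`` (hypothetical file path). A plate whose name
# contains "TS" is expanded into one entry per key, e.g. "-RT" and "-37":
#
#     names, replicates = read_repl("data/replicates.csv")
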
def std_prep(data_del, data_ts, data_sg, out_dir, wt_attr_name="ORF", wt_name=["YOR202W"]):
    """
    Standard preprocessing:

    (1) standardize WT strains in each plate and remove outliers,
    (2) standardize mutant strains,
    (3) combine computational and non-computational features from all plates.

    Save preprocessed data to directory :param:`out_dir` in Orange format as
    ``preprocessed_del_ts_sg_orange.tab`` and in CSV format as
    ``preprocessed_del_ts_sg_prep.csv``.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param out_dir: Full path to directory where preprocessed data is saved.
    :type out_dir: `str`
    :param wt_attr_name: Identifier of attribute that contains ORFs.
    :type wt_attr_name: `str`
    :param wt_name: Names of the wild-type ORFs.
    :type wt_name: `list`

    Return preprocessed computational profiles and plates data.

    .. seealso:: See also functions :func:`sa.methods.standardize` and
        :func:`sa.methods.detect_outliers`.
    """
    plates_wt, dnp_wtc = [], []
    plates_mt, dnp_mtc = [], []
    for coll in [data_del, data_ts, data_sg]:
        for meta, plate in zip(coll[0], coll[1]):
            wt, mt = split_WT_MT(meta, plate, wt_attr_name, wt_name)
            # wild-type strains: standardize and remove outliers
            dnp_wt = data2np(wt, skip_first=5, skip_last=1)
            dnp_wt = methods.standardize(dnp_wt)
            _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name=None, save=False)
            # mutant strains: standardize
            dnp_mt = data2np(mt, skip_first=5, skip_last=1)
            dnp_mt = methods.standardize(dnp_mt)
            # collect per-plate results
            plates_wt.extend(plate_wtr)
            dnp_wtc.extend(dnp_wtr)
            plates_mt.extend(mt)
            dnp_mtc.extend(dnp_mt)
    dnp_mtc = np.array(dnp_mtc)
    dnp_wtc = np.array(dnp_wtc)
    print "No. observations of mutant strains: %d" % len(plates_mt)
    print "No. observations of wild-type strains: %d" % len(plates_wt)
    dnp = np.vstack((dnp_wtc, dnp_mtc))
    plates = plates_wt + plates_mt
    assert dnp.shape[0] == len(plates), "The number of observations in computational and plate data do not match."
    # save data used in the analysis in Orange and CSV formats
    attrs = data_del[0][0][1]
    to_orange(attrs, plates, dnp, out_name=out_dir + "preprocessed_del_ts_sg", noncomp_first=5, noncomp_last=1)
    to_csv(attrs, plates, dnp, out_name=out_dir + "preprocessed_del_ts_sg", noncomp_first=5, noncomp_last=1)
    return dnp, plates
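
# Sketch of the full preprocessing pipeline (directory names are hypothetical;
# ``out_dir`` must end with a path separator, as it is concatenated with the
# output file name):
#
#     data_del = read("data/del/")
#     data_ts = read("data/ts/")
#     data_sg = read("data/sg/")
#     dnp, plates = std_prep(data_del, data_ts, data_sg, "results/")
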
def data2np(data, skip_first=5, skip_last=1):
    """Convert data to numpy array for further analysis, skipping some
    non-computational features. Empty values are replaced with 0.0."""
    dnp = np.zeros((len(data), len(data[0]) - skip_first - skip_last))
    for i, row in enumerate(data):
        dnp[i, :] = map(lambda x: float(x) if x != "" else 0.0, row[skip_first:-skip_last])
    print "Converted plate data to numpy array of computational features of shape: %s" % str(dnp.shape)
    return dnp
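
# Sketch for ``data2np`` on rows returned by ``read``; with the assumed layout
# the first five columns (sequence, plate, date, row, column) and the trailing
# ORF column are skipped:
#
#     dnp = data2np(plates[0], skip_first=5, skip_last=1)
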
def filter_attribute(attrs, data, attr_name="ORF", attr_values=["YOR202W"]):
    """Filter data by selecting only rows whose column named :param:`attr_name`
    matches one of the values in :param:`attr_values`."""
    nidx = attrs.index(attr_name)
    filtered = [row for row in data if row[nidx] in attr_values]
    print "No. filtered observations: %d" % len(filtered)
    return filtered
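
# Sketch for ``filter_attribute``; selects rows whose ORF column equals the
# wild-type identifier used throughout this module:
#
#     wt_rows = filter_attribute(attr_names, plates[0], attr_name="ORF",
#                                attr_values=["YOR202W"])
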
def split_WT_MT(meta, data, wt_mt_name="ORF", wt_name=["YOR202W"]):
    """Split plate data into two groups: (i) wild-type, (ii) mutants.
    Wild-type strains occupy the entire plate border."""
    fname, attrs = meta
    wt = []
    mt = []
    print "File: %s" % fname
    nidx = attrs.index(wt_mt_name)
    for row in data:
        if row[nidx] in wt_name:
            wt.append(row)
        else:
            mt.append(row)
    print "No. wild-type observations: %d" % len(wt)
    print "No. mutant observations: %d" % len(mt)
    return wt, mt
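
# Sketch for ``split_WT_MT`` on the metadata and rows of a single plate as
# returned by ``read``:
#
#     wt, mt = split_WT_MT(meta[0], plates[0])
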
def combine(meta, dataL):
    """Combine strain data from different plates into one set; the first
    attribute of each row is prefixed with its plate (file) name so that plate
    membership of every observation is preserved."""
    file_nameL, attrL = zip(*meta)
    print "Joining data from files: %s" % str(file_nameL)
    new_data = []
    for file_name, data in zip(file_nameL, dataL):
        new_data.extend([[file_name[:-4] + "__" + row[0]] + row[1:] for row in data])
    meta = ("joined_plates", attrL[0])
    pp_plate_info(meta, new_data)
    return meta, new_data
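
# Sketch for ``combine``; joins the rows of all plates returned by ``read``
# into a single data set with plate membership encoded in the first attribute:
#
#     joined_meta, joined_data = combine(meta, plates)
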
def plates2dict(plate, dnp):
    """Return dictionary from :param:`plate` indexed by plate identifier,
    row and column number."""
    # el[1] -- plate number
    # el[3] -- row number
    # el[4] -- column number
    assert len(plate) == dnp.shape[0], "Dimension mismatch."
    return {(el[1], el[3], el[4]): dnp[i, :] for i, el in enumerate(plate)}
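
# Sketch for ``plates2dict``; builds a lookup of computational profiles by
# (plate, row, column) for the mutant rows of one plate (the key values shown
# are hypothetical strings as they appear in the CSV):
#
#     mt_profiles = plates2dict(mt, data2np(mt))
#     profile = mt_profiles[("plate_01", "3", "7")]
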
def pp_plate_info(meta, plate):
    """Pretty-print basic plate information: name, number of features and
    number of observations."""
    file_name, attrs = meta
    print "Plate: %s" % file_name
    top = int(0.05 * len(attrs))
    print "No. features: %d. These are: %s" % (len(attrs), ", ".join(attrs[:top]) + " ... " + attrs[-1])
    print "No. observations: %d" % len(plate)
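
# Sketch for ``pp_plate_info`` on a single plate returned by ``read``:
#
#     pp_plate_info(meta[0], plates[0])
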
def to_orange(names, data_org, dnp, out_name, noncomp_first=5, noncomp_last=1):
    """Save data in Orange format. Non-computational features are stored as
    meta attributes."""
    assert len(names) == len(data_org[0]), "The number of feature names and values do not match."
    assert len(data_org) == dnp.shape[0], "The number of observations for comp. and noncomp. features do not match."
    f = open(out_name + "_orange.tab", "w")
    f.write("%s\n" % "\t".join(names))
    comp = len(names) - noncomp_first - noncomp_last
    f.write("%s\n" % "\t".join(["d" for _ in xrange(noncomp_first)] + ["c" for _ in xrange(comp)] + ["d" for _ in xrange(noncomp_last)]))
    f.write("%s\n" % "\t".join(["meta" for _ in xrange(noncomp_first)] + ["" for _ in xrange(comp)] + ["meta" for _ in xrange(noncomp_last)]))
    for i, r in enumerate(data_org):
        f.write("%s\n" % "\t".join(r[:noncomp_first + 1] + map(str, list(dnp[i, 1:])) + r[-noncomp_last:]))
    f.close()
    print "Data saved in Orange format to: %s_orange.tab" % out_name
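
# Sketch for ``to_orange`` (hypothetical output prefix); writes
# "results/preprocessed_orange.tab" with the non-computational columns
# declared as meta attributes:
#
#     attrs = data_del[0][0][1]
#     to_orange(attrs, plates, dnp, out_name="results/preprocessed")
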
def to_csv(names, data_org, dnp, out_name, noncomp_first=5, noncomp_last=1):
    """Save data in CSV format."""
    assert len(names) == len(data_org[0]), "The number of feature names and values do not match."
    assert len(data_org) == dnp.shape[0], "The number of observations for comp. and noncomp. features do not match."
    f = open(out_name + "_prep.csv", "w")
    f.write("%s\n" % ",".join(names))
    comp = len(names) - noncomp_first - noncomp_last
    for i, r in enumerate(data_org):
        f.write("%s\n" % ",".join(r[:noncomp_first + 1] + map(str, list(dnp[i, 1:])) + r[-noncomp_last:]))
    f.close()
    print "Data saved in CSV format to: %s_prep.csv" % out_name
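
# Sketch for ``to_csv`` (hypothetical output prefix); writes
# "results/preprocessed_prep.csv" with the same column order as the input
# header:
#
#     to_csv(attrs, plates, dnp, out_name="results/preprocessed")
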