"""
############################
utilities (``sa.utilities``)
############################
This script reads CSV files with extracted data.
Assumed file format:
* sequence number
* plate
* date
* row
* column
* ... (computational features -- those that can be interpreted as countable real numbers)
* ORF
"""
import numpy as np
import os
import methods
def read(dir_path=None, *files):
    """
    Read CSV plate files and return their headers and data rows.

    If ``dir_path`` is given, every entry returned by ``os.listdir(dir_path)``
    is read and anything passed in ``files`` is ignored. If ``dir_path`` is
    ``None``, the entries of ``files`` are opened exactly as given.

    The first line of each file is taken as the header. Blank lines are
    skipped so that a trailing newline does not produce an empty record.

    :param dir_path: Directory with the CSV files, or ``None`` to use ``files``.
    :type dir_path: `str`
    :param files: Paths of files to read when ``dir_path`` is ``None``.
    :type files: `str`

    Return a tuple ``(meta, plates)``. ``meta`` is a list of
    ``(file_name, attr_names)`` entries and ``plates`` is a list, in the same
    order, holding for each file a list of rows (each row a list of strings).
    """
    if dir_path is not None:
        files = os.listdir(dir_path)
        print("Reading data files from path: %s" % dir_path)
    meta = []
    plates = []
    for fname in files:
        # os.path.join works with or without a trailing separator on dir_path;
        # without a directory the given name is already the path to open.
        path = fname if dir_path is None else os.path.join(dir_path, fname)
        with open(path, "r") as f:
            names = f.readline().strip().split(",")
            stripped = (line.strip() for line in f)
            data = [line.split(",") for line in stripped if line]
        meta.append((fname, names))
        plates.append(data)
    print("Read files: %s" % "\n\t".join(files))
    return meta, plates
def read_repl(file_path, keys=["RT", "37"]):
    """
    Read a CSV file listing repeating mutants with columns [ORF, plate, row, column].

    The first line is the header. Blank lines are skipped. A record whose plate
    name contains ``"TS"`` is expanded into one record per entry of ``keys``,
    with the plate name suffixed by ``"-" + key``; other records are kept as is.

    :param file_path: Full file path to CSV file with information on replicates.
    :type file_path: `str`
    :param keys: Names of TS (temperature sensitive mutants) plates' extensions. By default, these are RT and 37.
    :type keys: `list`

    Return a tuple ``(names, data)`` where ``names`` is the header as a list of
    strings and ``data`` is a list of ``(orf, plate, row, column)`` tuples.
    Raises ``ValueError`` if a non-blank record does not have exactly four fields.
    """
    # ``keys`` is only iterated, never mutated, so the shared default is safe.
    with open(file_path, "r") as f:
        names = f.readline().strip().split(",")
        stripped = (line.strip() for line in f)
        data = [line.split(",") for line in stripped if line]
    data_k = []
    for orf, pn, r, c in data:
        if "TS" in pn:
            data_k.extend((orf, pn + "-" + key, r, c) for key in keys)
        else:
            data_k.append((orf, pn, r, c))
    return names, data_k
def std_prep(data_del, data_ts, data_sg, out_dir, wt_attr_name="ORF", wt_name=["YOR202W"]):
    """
    Standard preprocessing; (1) standardize WT strains in each plate and remove outliers,
    (2) standardize mutant strains, (3) combine computational and
    non-computational features from all plates.

    Save preprocessed data in Orange format to directory ``out_dir`` as
    `preprocessed_del_ts_sg_orange.tab`.
    Save preprocessed data in CSV to directory ``out_dir`` as
    `preprocessed_del_ts_sg_prep.csv`.

    :param data_del: Deletion collection plates data as returned from :mod:`utilities.read`.
    :type data_del: `tuple` (meta_data, plates_data)
    :param data_ts: TS collection plates data as returned from :mod:`utilities.read`.
    :type data_ts: `tuple` (meta_data, plates_data)
    :param data_sg: SG collection plates data as returned from :mod:`utilities.read`.
    :type data_sg: `tuple` (meta_data, plates_data)
    :param out_dir: Full path to directory where the output files are saved
        (with or without a trailing path separator).
    :type out_dir: `str`
    :param wt_attr_name: Identifier of attribute that contains ORFs.
    :type wt_attr_name: `str`
    :param wt_name: Names of the wild-type ORFs.
    :type wt_name: `list`

    Return a tuple ``(dnp, plates)``: the stacked computational profiles
    (wild-type rows first, then mutant rows) and the matching plate rows.

    .. seealso:: See also functions :func:`sa.methods.standardize` and :func:`sa.methods.detect_outliers`.
    """
    plates_wt, dnp_wtc = [], []
    plates_mt, dnp_mtc = [], []
    for coll in [data_del, data_ts, data_sg]:
        for meta, plate in zip(coll[0], coll[1]):
            wt, mt = split_WT_MT(meta, plate, wt_attr_name, wt_name)

            # WT: standardize per plate, then keep what detect_outliers returns
            dnp_wt = data2np(wt, skip_first=5, skip_last=1)
            dnp_wt = methods.standardize(dnp_wt)
            _, (plate_wtr, dnp_wtr) = methods.detect_outliers(dnp_wt, wt, out_name=None, save=False)

            # MUTANTS: standardize per plate, no outlier removal
            dnp_mt = data2np(mt, skip_first=5, skip_last=1)
            dnp_mt = methods.standardize(dnp_mt)

            # accumulate across plates and collections
            plates_wt.extend(plate_wtr)
            dnp_wtc.extend(dnp_wtr)
            plates_mt.extend(mt)
            dnp_mtc.extend(dnp_mt)

    dnp_mtc = np.array(dnp_mtc)
    dnp_wtc = np.array(dnp_wtc)
    print("No. observations of mutant strains: %d" % len(plates_mt))
    print("No. observations of wild-type strains: %d" % len(plates_wt))

    # Row order of dnp and plates must agree: wild-type first, then mutants.
    dnp = np.vstack((dnp_wtc, dnp_mtc))
    plates = plates_wt + plates_mt
    assert dnp.shape[0] == len(plates), "The number of observations in computational and plate data do not match."

    # Attribute names are taken from the first deletion-collection file.
    # NOTE(review): presumably all files share the same header -- confirm.
    attrs = data_del[0][0][1]
    out_name = os.path.join(out_dir, "preprocessed_del_ts_sg")
    to_orange(attrs, plates, dnp, out_name=out_name, noncomp_first=5, noncomp_last=1)
    to_csv(attrs, plates, dnp, out_name=out_name, noncomp_first=5, noncomp_last=1)
    return dnp, plates
def data2np(data, skip_first=5, skip_last=1):
    """
    Convert plate rows to a numpy array of computational features.

    The first ``skip_first`` and the last ``skip_last`` entries of every row
    are dropped; the remaining entries are converted with ``float``, with the
    empty string mapped to ``0.0``.

    :param data: Rows of string values; the column count is taken from the first row.
    :type data: `list`
    :param skip_first: Number of leading non-computational columns to drop.
    :type skip_first: `int`
    :param skip_last: Number of trailing non-computational columns to drop (may be 0).
    :type skip_last: `int`

    Return a 2-D float array with one row per entry of ``data``. Empty ``data``
    gives an array of shape ``(0, 0)``.
    """
    if not data:
        dnp = np.zeros((0, 0))
    else:
        dnp = np.zeros((len(data), len(data[0]) - skip_first - skip_last))
        for i, row in enumerate(data):
            # An explicit end index is used because row[a:-0] would be empty.
            stop = len(row) - skip_last
            dnp[i, :] = [float(x) if x != "" else 0.0 for x in row[skip_first:stop]]
    print("Converted plate data to numpy array of computational features of shape: %s" % str(dnp.shape))
    return dnp
def filter_attribute(attrs, data, attr_name="ORF", attr_values=["YOR202W"]):
    """
    Select the rows of ``data`` whose ``attr_name`` column is one of ``attr_values``.

    :param attrs: Attribute (column) names, in the same order as the row entries.
    :type attrs: `list`
    :param data: Rows to filter.
    :type data: `list`
    :param attr_name: Name of the column to test.
    :type attr_name: `str`
    :param attr_values: Accepted values for that column.
    :type attr_values: `list`

    Return the matching rows in their original order. Raises ``ValueError``
    if ``attr_name`` is not in ``attrs``.
    """
    nidx = attrs.index(attr_name)
    filtered = [row for row in data if row[nidx] in attr_values]
    print("No. filtered observations: %d" % len(filtered))
    return filtered
def split_WT_MT(meta, data, wt_mt_name="ORF", wt_name=["YOR202W"]):
    """
    Split plate data to two groups: (i) wild-type, (ii) mutants.

    A row is wild-type when its ``wt_mt_name`` column holds one of the values
    in ``wt_name``; every other row is a mutant.

    :param meta: ``(file_name, attr_names)`` entry describing the plate file.
    :type meta: `tuple`
    :param data: Plate rows.
    :type data: `list`
    :param wt_mt_name: Name of the column that distinguishes the two groups.
    :type wt_mt_name: `str`
    :param wt_name: Column values that mark a wild-type row.
    :type wt_name: `list`

    Return a tuple ``(wt, mt)`` of row lists, each in original row order.
    Raises ``ValueError`` if ``wt_mt_name`` is not among the attribute names.
    """
    fname, attrs = meta
    print("File: %s" % fname)
    nidx = attrs.index(wt_mt_name)
    wt = []
    mt = []
    for row in data:
        target = wt if row[nidx] in wt_name else mt
        target.append(row)
    print("No. wild-type observations: %d" % len(wt))
    print("No. mutant observations: %d" % len(mt))
    return wt, mt
def combine(meta, dataL):
    """
    Combine strains data from different plates into one set.

    The first entry of every row is prefixed with the name of the file it came
    from (extension removed) followed by ``"__"``, so the plate membership of
    each observation stays recoverable.

    :param meta: List of ``(file_name, attr_names)`` entries, one per plate.
    :type meta: `list`
    :param dataL: List of plate row lists, in the same order as ``meta``.
    :type dataL: `list`

    Return a tuple ``(meta, new_data)`` where ``meta`` is
    ``("joined_plates", attr_names_of_first_plate)`` and ``new_data`` is the
    concatenation of all rewritten rows. Raises ``ValueError`` if ``meta`` is empty.
    """
    file_nameL, attrL = zip(*meta)
    print("Joining data from files: %s" % str(file_nameL))
    new_data = []
    for file_name, data in zip(file_nameL, dataL):
        # splitext drops whatever extension is present instead of assuming
        # that it is exactly four characters long (e.g. ".csv").
        prefix = os.path.splitext(file_name)[0] + "__"
        new_data.extend([prefix + row[0]] + row[1:] for row in data)
    meta = ("joined_plates", attrL[0])
    pp_plate_info(meta, new_data)
    return meta, new_data
def plates2dict(plate, dnp):
    """
    Return a dictionary of the rows of ``dnp`` indexed by plate identifier, row and column.

    The key of observation ``i`` is ``(el[1], el[3], el[4])`` of ``plate[i]``,
    i.e. plate number, row number and column number. If several observations
    share a key, the last one wins.

    :param plate: Plate rows, one per observation.
    :type plate: `list`
    :param dnp: 2-D array with one row per observation.
    :type dnp: `numpy.ndarray`

    Raises ``AssertionError`` if the number of rows differs.
    """
    # el[1] -- plate number
    # el[3] -- row number
    # el[4] -- column number
    assert len(plate) == dnp.shape[0], "Dimension mismatch."
    return {(el[1], el[3], el[4]): dnp[i, :] for i, el in enumerate(plate)}
def pp_plate_info(meta, plate):
    """
    Print a short summary of a plate: file name, feature count and row count.

    Only the first 5% of the feature names (rounded down) and the last name
    are listed. An empty attribute list is reported with no names.

    :param meta: ``(file_name, attr_names)`` entry describing the plate.
    :type meta: `tuple`
    :param plate: Plate rows.
    :type plate: `list`
    """
    file_name, attrs = meta
    print("Plate: %s" % file_name)
    top = int(0.05 * len(attrs))
    # Guard against an empty attribute list, where attrs[-1] would raise.
    last = attrs[-1] if attrs else ""
    summary = ", ".join(attrs[:top]) + " ... " + last
    print("No. features: %d. These are: %s" % (len(attrs), summary))
    print("No. observations: %d" % len(plate))
def to_orange(names, data_org, dnp, out_name, noncomp_first=5, noncomp_last=1):
    """
    Save data in Orange tab-delimited format to ``out_name + "_orange.tab"``.

    Three header lines are written: feature names, feature types (``d`` for
    the non-computational columns, ``c`` for the computational ones) and flags
    (``meta`` for the non-computational columns). One line per observation follows.

    :param names: Feature names, one per column of ``data_org``.
    :type names: `list`
    :param data_org: Original plate rows (string values).
    :type data_org: `list`
    :param dnp: Computational features, one row per entry of ``data_org``.
    :type dnp: `numpy.ndarray`
    :param out_name: Output path without the ``_orange.tab`` suffix.
    :type out_name: `str`
    :param noncomp_first: Number of leading non-computational columns.
    :type noncomp_first: `int`
    :param noncomp_last: Number of trailing non-computational columns (may be 0).
    :type noncomp_last: `int`

    Raises ``AssertionError`` if the column or row counts do not match.
    """
    assert len(names) == len(data_org[0]), "The number of feature names and values do not match."
    assert len(data_org) == dnp.shape[0], "The number of observations for comp. and noncomp. features do not match."
    comp = len(names) - noncomp_first - noncomp_last
    types = ["d"] * noncomp_first + ["c"] * comp + ["d"] * noncomp_last
    flags = ["meta"] * noncomp_first + [""] * comp + ["meta"] * noncomp_last
    with open(out_name + "_orange.tab", "w") as f:
        f.write("%s\n" % "\t".join(names))
        f.write("%s\n" % "\t".join(types))
        f.write("%s\n" % "\t".join(flags))
        for i, r in enumerate(data_org):
            # NOTE(review): the first computational column is copied from the
            # original row and dnp[i, 0] is not written -- kept as is; confirm
            # this is intended rather than an off-by-one.
            head = list(r[:noncomp_first + 1])
            body = [str(v) for v in dnp[i, 1:]]
            # An explicit start index is used because r[-0:] would be the whole row.
            tail = list(r[len(r) - noncomp_last:])
            f.write("%s\n" % "\t".join(head + body + tail))
    print("Data saved in Orange format to: %s_orange.tab" % out_name)
def to_csv(names, data_org, dnp, out_name, noncomp_first=5, noncomp_last=1):
    """
    Save data in CSV format to ``out_name + "_prep.csv"``.

    A header line with the feature names is written, followed by one line per
    observation. Values are joined with commas and are not quoted.

    :param names: Feature names, one per column of ``data_org``.
    :type names: `list`
    :param data_org: Original plate rows (string values).
    :type data_org: `list`
    :param dnp: Computational features, one row per entry of ``data_org``.
    :type dnp: `numpy.ndarray`
    :param out_name: Output path without the ``_prep.csv`` suffix.
    :type out_name: `str`
    :param noncomp_first: Number of leading non-computational columns.
    :type noncomp_first: `int`
    :param noncomp_last: Number of trailing non-computational columns (may be 0).
    :type noncomp_last: `int`

    Raises ``AssertionError`` if the column or row counts do not match.
    """
    assert len(names) == len(data_org[0]), "The number of feature names and values do not match."
    assert len(data_org) == dnp.shape[0], "The number of observations for comp. and noncomp. features do not match."
    with open(out_name + "_prep.csv", "w") as f:
        f.write("%s\n" % ",".join(names))
        for i, r in enumerate(data_org):
            # NOTE(review): the first computational column is copied from the
            # original row and dnp[i, 0] is not written -- kept as is; confirm
            # this is intended rather than an off-by-one.
            head = list(r[:noncomp_first + 1])
            body = [str(v) for v in dnp[i, 1:]]
            # An explicit start index is used because r[-0:] would be the whole row.
            tail = list(r[len(r) - noncomp_last:])
            f.write("%s\n" % ",".join(head + body + tail))
    print("Data saved in CSV format to: %s_prep.csv" % out_name)