# 2) mA bigWig # 3) mC bigWig # 4) RNA-seq bigWig # will need to import classes from a utils file import gzip import numpy as np import pyBigWig as pbw import pyfaidx import argparse from tqdm import tqdm from pathlib import Path from pybedtools import BedTool from skimage.transform import resize np.set_printoptions(suppress=True) #### ## need to split datasheets into train test val ## samples to it is easy to read in from DataLoader def main(): ''' In the context of fiber2ctcf shape data contains helper classes. All training sites are generated via bedtools / grep / awk from command line. Train/test/val split is generated by partitioning chromosomes ''' return None def init_parser(): parser = argparse.ArgumentParser(description='sm2rna process training data') # Data parser.add_argument('--sites', type=str, help='training / test data sites') # Where to store the output data parser.add_argument('--out_dir', help='out_dir for training data files') args = parser.parse_args() return args.genome, args.chromsizes, args.window, args.stride def windowGenerator(size,stride,window): end = 0 start = 0 stride_shift = int(0.8 * stride) while start < size - window: end = start + window yield start,end start_shift = np.random.choice(range(-stride_shift,0)) + stride start = start + start_shift def readChromSizes(chromsizes): chromsizes_dict = {} with open(chromsizes,'r') as handle: for line in handle: split_line = line.strip().split() chromsizes_dict[split_line[0]] = int(split_line[1]) return chromsizes_dict class GenomicFeatureSingleThread: def __init__(self, path, norm): self.path = path self.load(path) self.norm = norm def load(self, path): self.feature = self.read_feature(path) def get(self, chr_name, start, end, add_noise=False): feature = self.feature_to_npy(chr_name, start, end) feature = np.nan_to_num(feature, 0) # Important! replace nan with 0 if add_noise: feature = self.add_gaussian_noise(feature) if self.norm == True : feature = np.log(feature + 1) return feature def read_feature(self, path): # print(path) bw_file = pbw.open(path) return bw_file def feature_to_npy(self, chr_name, start, end): signals = self.feature.values(chr_name, start, end) return np.array(signals) def length(self, chr_name): return self.feature.chroms(chr_name) def add_gaussian_noise(self, inputs, std = 0.01): noise = np.random.randn(*inputs.shape) * std outputs = inputs + noise outputs[outputs < 0] = 0 return outputs class HiCFeature: def __init__(self, path = None): self.hic = self.load_hic(path) def get(self, start, window = 2097152, res = 10000): start_bin = int(start / res) range_bin = int(window / res) end_bin = start_bin + range_bin hic_mat = self.diag_to_mat(self.hic, start_bin, end_bin) return hic_mat def load_hic(self, path): return dict(np.load(path)) def resize_diag(self,mat): image_scale = 256 mat = resize(mat, (image_scale, image_scale), anti_aliasing=True) mat = np.log(mat + 1) return mat def diag_to_mat(self, ori_load, start, end): ''' Only accessing 256 x 256 region max, two loops are okay ''' square_len = end - start diag_load = {} for diag_i in range(square_len): diag_load[str(diag_i)] = ori_load[str(diag_i)][start : start + square_len - diag_i] diag_load[str(-diag_i)] = ori_load[str(-diag_i)][start : start + square_len - diag_i] start -= start end -= start diag_region = [] for diag_i in range(square_len): diag_line = [] for line_i in range(-1 * diag_i, -1 * diag_i + square_len): if line_i < 0: diag_line.append(diag_load[str(line_i)][start + line_i + diag_i]) else: diag_line.append(diag_load[str(line_i)][start + diag_i]) diag_region.append(diag_line) diag_region = np.array(diag_region).reshape(square_len, square_len) return self.resize_diag(diag_region) def __len__(self): return len(self.hic['0']) if __name__ == '__main__': main()