# 2) mA bigWig
# 3) mC bigWig
# 4) RNA-seq bigWig
# will need to import classes from a utils file


import gzip
import numpy as np
import pyBigWig as pbw
import pyfaidx
import argparse 
from tqdm import tqdm
from pathlib import Path
from pybedtools import BedTool
from skimage.transform import resize

np.set_printoptions(suppress=True)

#### 
## need to split datasheets into train test val 
## samples to it is easy to read in from DataLoader

def main():
	''' In the context of fiber2ctcf shape data contains 
		helper classes. All training sites are generated via bedtools / grep / awk from 
		command line. Train/test/val split is generated by partitioning chromosomes '''
	return None


def init_parser(): 
	
	parser = argparse.ArgumentParser(description='sm2rna process training data')

	# Data 
	parser.add_argument('--sites', type=str, help='training / test data sites')

	# Where to store the output data
	parser.add_argument('--out_dir', help='out_dir for training data files')
	args = parser.parse_args()


	return args.genome, args.chromsizes, args.window, args.stride


def windowGenerator(size,stride,window):

	end = 0
	start = 0
	
	stride_shift = int(0.8 * stride)


	while start < size - window:

		end = start + window

		yield start,end

		start_shift = np.random.choice(range(-stride_shift,0)) + stride

		start = start + start_shift

def readChromSizes(chromsizes):

	chromsizes_dict = {}

	with open(chromsizes,'r') as handle:
		for line in handle:
			split_line = line.strip().split()

			chromsizes_dict[split_line[0]] = int(split_line[1])

	return chromsizes_dict


class GenomicFeatureSingleThread:

	def __init__(self, path, norm):
		self.path = path
		self.load(path)
		self.norm = norm

	def load(self, path):
		self.feature = self.read_feature(path)

	def get(self, chr_name, start, end, add_noise=False):

		feature = self.feature_to_npy(chr_name, start, end)
		feature = np.nan_to_num(feature, 0) # Important! replace nan with 0

		if add_noise:
			feature = self.add_gaussian_noise(feature)

		if self.norm == True :

			feature = np.log(feature + 1)

		return feature

	def read_feature(self, path):
#		print(path)
		bw_file = pbw.open(path)
		return bw_file

	def feature_to_npy(self, chr_name, start, end):
		signals = self.feature.values(chr_name, start, end)
		return np.array(signals)

	def length(self, chr_name):
		return self.feature.chroms(chr_name)

	def add_gaussian_noise(self, inputs, std = 0.01):
		noise = np.random.randn(*inputs.shape) * std
		outputs = inputs + noise
		outputs[outputs < 0] = 0 
		return outputs

class HiCFeature:

	def __init__(self, path = None):
		self.hic = self.load_hic(path)

	def get(self, start, window = 2097152, res = 10000):
		start_bin = int(start / res)
		range_bin = int(window / res)
		end_bin = start_bin + range_bin
		hic_mat = self.diag_to_mat(self.hic, start_bin, end_bin)
		return hic_mat

	def load_hic(self, path):
		return dict(np.load(path))
	
	def resize_diag(self,mat):
		image_scale = 256
		mat = resize(mat, (image_scale, image_scale), anti_aliasing=True)
		mat = np.log(mat + 1)
		return mat
	
	def diag_to_mat(self, ori_load, start, end):
		'''
		Only accessing 256 x 256 region max, two loops are okay
		'''
		square_len = end - start
		diag_load = {}
		for diag_i in range(square_len):
			diag_load[str(diag_i)] = ori_load[str(diag_i)][start : start + square_len - diag_i]
			diag_load[str(-diag_i)] = ori_load[str(-diag_i)][start : start + square_len - diag_i]
		start -= start
		end -= start

		diag_region = []
		for diag_i in range(square_len):
			diag_line = []
			for line_i in range(-1 * diag_i, -1 * diag_i + square_len):
				if line_i < 0:
					diag_line.append(diag_load[str(line_i)][start + line_i + diag_i])
				else:
					diag_line.append(diag_load[str(line_i)][start + diag_i])
			diag_region.append(diag_line)
		diag_region = np.array(diag_region).reshape(square_len, square_len)
		return self.resize_diag(diag_region)
	

	def __len__(self):
		return len(self.hic['0'])

if __name__ == '__main__':
    main()