#!/usr/bin/env ruby

require_relative "params_reader.rb"

########################
# helper functions
########################
# Name of the CSV column used as the per-cluster/group evidence score.
def get_score_feature_key()
	# alternative scoring column: "ReadCount"
	"ConversionEventCount"
end

# Name of the CSV column holding the sequence: group-level when
# use_groups is exactly "yes", cluster-level otherwise.
def get_sequence_feature_key(use_groups)
	use_groups == "yes" ? "GroupSequence" : "ClusterSequence"
end

# Base-2 logarithm of x.
# Uses the stdlib Math.log2 instead of the hand-rolled ratio
# Math.log(x)/Math.log(2), which is both clearer and exact for powers
# of two (the ratio form suffers from float rounding, e.g. log2(8)).
def log2(x)
	return Math.log2(x)
end

# Right-pad 'seq' with 'X' characters up to 'max_len'; a sequence that is
# already max_len or longer is returned unchanged in value.
# BUGFIX: the previous version appended to the caller's string object with
# '<<', mutating the argument in place as a side effect. String#ljust
# returns a padded copy and leaves the input untouched.
def pad_X(seq, max_len)
  seq.ljust(max_len, 'X')
end

# Length of the longest (whitespace-stripped) sequence among all entries
# of 'data' (a hash of id => feature-hash); returns 0 for empty data.
# Replaces the manual max-tracking loop with the Enumerable idiom.
def find_max_len(data, sequence_feature_key)
  data.values.map { |entry| entry[sequence_feature_key].strip.length }.max || 0
end

# Smallest score value (as Float) across all entries of 'data'; returns 0
# for empty data (matching the original first-flag implementation).
# The score column name is loop-invariant, so it is now looked up once
# instead of once per entry, and the manual min-tracking is replaced by
# Enumerable#min.
def get_min_value_of_score_feature(data)
	score_feature_key = get_score_feature_key()
	return data.values.map { |entry| entry[score_feature_key].to_f }.min || 0
end

# Write every key/value pair of 'pars' to 'out_pars_file' (one "k = v"
# line per parameter, preceded by a header) and echo the same text,
# followed by a separator line, to stdout.
def create_output_params_file(out_pars_file, pars)
	print "out_pars_file: #{out_pars_file}\n"
	report = "PARAMETER values:\n"
	report << "-----------------\n"
	param_lines = pars.keys.collect { |key| "#{key} = #{pars[key].inspect}" }
	report << param_lines.join("\n") + "\n"
	# block form guarantees the file handle is closed
	File.open(out_pars_file, "w") { |f_out| f_out.write(report) }
	report << "---------------------------\n"
	print report
end
########################
# parsing helpers
########################
# Parse one CSV row of a PARalyzer 'clusters' file into a feature hash.
# Numeric columns are coerced; "conversions_per_read" is derived as
# ConversionEventCount / ReadCount.
def parse_cluster_file_line(line)
	fields = line.strip.split(",")
	record = {
		"Chromosome"              => fields[0],
		"Strand"                  => fields[1],
		"ClusterStart"            => fields[2].to_i,
		"ClusterEnd"              => fields[3].to_i,
		"ClusterID"               => fields[4],
		"ClusterSequence"         => fields[5],
		"ReadCount"               => fields[6].to_i,
		"ModeLocation"            => fields[7].to_i,
		"ModeScore"               => fields[8].to_f,
		"ConversionLocationCount" => fields[9].to_i,
		"ConversionEventCount"    => fields[10].to_i,
		"NonConversionEventCount" => fields[11].to_i,
		"conversions_per_read"    => fields[10].to_f / fields[6].to_f,
		"FilterType"              => fields[12],
		"TranscriptLocation"      => fields[13],
		"TranscriptID"            => fields[14],
		"GeneName"                => fields[15]
	}
	return record
end

#Chromosome,Strand,GroupStart,GroupEnd,GroupID,GroupSequence,ReadCount,FilterType,TranscriptLocation,TranscriptID,GeneName
# Parse one CSV row of a 'groups' file into a feature hash
# (column layout documented in the header comment above).
def parse_groups_file_line(line)
	fields = line.strip.split(",")
	record = {
		"Chromosome"              => fields[0],
		"Strand"                  => fields[1],
		"GroupStart"              => fields[2].to_i,
		"GroupEnd"                => fields[3].to_i,
		"GroupID"                 => fields[4],
		"GroupSequence"           => fields[5],
		"ReadCount"               => fields[6].to_i,
		"ConversionLocationCount" => fields[7].to_i,
		"ConversionEventCount"    => fields[8].to_i,
		"FilterType"              => fields[9],
		"TranscriptLocation"      => fields[10],
		"TranscriptID"            => fields[11],
		"GeneName"                => fields[12]
	}
	return record
end

# Return true iff entry 'h' satisfies every constraint in 'filter_params'.
# - "ConversionEventCount" and "ReadCount" are minimum thresholds
#   (entry value must be >= filter value);
# - "TranscriptLocation" accepts a '+'-separated list of allowed
#   locations (entry passes if it matches any of them);
# - every other key requires exact string equality.
# A filter key missing from 'h' is a configuration error: an error is
# printed and the process exits (original behavior, kept).
# Cleanup vs. the previous version: removed the unused locals
# 'negate_comparison_flag' and 'value', merged the two duplicated numeric
# threshold branches, and replaced the flag-and-double-break control flow
# with an early return.
def pass_filter(h, filter_params)
	filter_params.each do |key, val|
		if(h[key] == nil)
			print "ERROR: required filter key is not part of extracted cluster features!\n"
			print "loc_key= #{key}\n"
			exit
		end
		passed = case key
			when "ConversionEventCount", "ReadCount"
				# minimum-threshold filters
				h[key].to_i >= val.to_i
			when "TranscriptLocation"
				# pass if the entry matches any of the '+'-separated locations
				val.split("+").any? { |loc| h[key].to_s == loc.to_s }
			else
				# exact match for all other keys
				h[key].to_s == val.to_s
			end
		return false unless passed
	end
	return true
end


# One-hot encode entry["TranscriptLocation"] over the fixed category list
# [3UTR, CDS, 5UTR, intron]; any other location ("other") yields the
# all-zero vector. Returns [tab-joined indicator string, category list].
# BUGFIX/consistency: the previous version returned a tab-joined String
# for the four known categories but the raw Array [0,0,0,0] for the
# default case; downstream output only worked because Array#join happens
# to flatten nested arrays. The default is now the same "0\t0\t0\t0"
# string form as every other case.
def generate_TranscriptLocation_feature(entry, feature_cols_to_extract)
	categories = ["3UTR","CDS","5UTR","intron"]
	indicators = categories.map { |c| entry["TranscriptLocation"] == c ? "1" : "0" }
	return [indicators.join("\t"), categories]
end


# Build the hash of binary/one-hot feature values for a single entry,
# restricted to the features named in 'feature_cols_to_extract'.
# Returns [feature_hash, transcript-location category list].
def extract_features_single_entry(entry, feature_cols_to_extract, log_transform)
	one_hot_loc, loc_categories = generate_TranscriptLocation_feature(entry, feature_cols_to_extract)
	plus_strand     = entry["Strand"] == "-" ? "0" : "1"
	gene_known      = entry["GeneName"] == "NA" ? "0" : "1"
	transcript_known = entry["TranscriptID"] == "NA" ? "0" : "1"
	# NOTE(review): this indicator is "1" when FilterType IS "NA", despite
	# being stored under the key "FilterType_not_NA" below — the polarity
	# looks inverted; confirm intended semantics before relying on it.
	filter_is_na    = entry["FilterType"] != "NA" ? "0" : "1"
	features = {}
	feature_cols_to_extract.each do |col|
		case col
		when "TranscriptLocation"
			features[col] = one_hot_loc
		when "Strand"
			features["+Strand"] = plus_strand
		when "FilterType"
			features["FilterType_not_NA"] = filter_is_na
		when "GeneName"
			features["GeneName_not_NA"] = gene_known
		when "TranscriptID"
			features["TranscriptID_not_NA"] = transcript_known
		else
			# unknown column names are silently ignored
		end
	end
	return [features, loc_categories]
end


# Extract the feature matrix (one row per entry of 'data') and the list of
# feature column names. TranscriptLocation expands into one column per
# category; all other features contribute one column each.
# Returns [feature_names, feature_mat].
# BUGFIX: the previous version called 'k.each' where k is a String hash
# key — String#each does not exist in Ruby >= 1.9, so the name-collection
# branch raised NoMethodError whenever any feature was extracted (it was
# only masked by the currently-empty feature list). The String is now
# tested directly.
# BUGFIX: selection_hash initialized new keys to 0 instead of 1, so every
# printed TranscriptLocation tally was one short.
def extract_features(data, feature_cols_to_extract, log_transform)
	feature_mat = []
	feature_names = []
	first = true
	selection_hash = {}
	data.each do |key, entry|
		h, transcriptLocation_categories = extract_features_single_entry(entry, feature_cols_to_extract, log_transform)
		loc = entry["TranscriptLocation"]
		selection_hash[loc] = (selection_hash[loc] || 0) + 1
		row = []
		h.each do |k, v|
			row.push(v)
			if(first == true)
				if(k == "TranscriptLocation")
					# one column name per one-hot category
					feature_names.concat(transcriptLocation_categories)
				else
					feature_names.push(k)
				end
			end
		end
		feature_mat.push(row)
		first = false
	end
	print "selection hash\n#{selection_hash.inspect}"
	return [feature_names, feature_mat]
end


################################################################
# Store entries from 'clusters_file' into hash 'cluster_data'.
################################################################
# one entry per line is processed using 'parse_cluster_file_line'
# and stored into hash 'cluster_data' using "ClusterID" as the 
# key for each entry
################################################################
################################################################
# Store entries from 'clusters_file' into hash 'cluster_data'.
################################################################
# One entry per line is parsed with 'parse_cluster_file_line';
# entries passing 'pass_filter' are stored keyed by "ClusterID".
# The first line of the file is a header and is skipped.
# Cleanup: removed the unused local hashes 'loc' and 'strand' and the
# manual first-line flag (replaced by drop(1)).
################################################################
def read_cluster_file(clusters_file, filter_params)
	cluster_data = {}
	IO.readlines(clusters_file).drop(1).each do |line|
		h = parse_cluster_file_line(line)
		cluster_data[h["ClusterID"]] = h if pass_filter(h, filter_params)
	end
	return cluster_data
end


# Read a 'groups' CSV file: one entry per line parsed with
# 'parse_groups_file_line'; entries passing 'pass_filter' are stored into
# the returned hash keyed by "GroupID". The first line is a header and is
# skipped.
# Cleanup: removed the unused locals 'cr' and 'count', the empty 'else'
# branch, and the manual first-line flag (replaced by drop(1)) —
# consistent with read_cluster_file.
def read_groups_file(groups_file, filter_params)
	cluster_data = {}
	IO.readlines(groups_file).drop(1).each do |line|
		h = parse_groups_file_line(line)
		cluster_data[h["GroupID"]] = h if pass_filter(h, filter_params)
	end
	return cluster_data
end



# Main pipeline step: read the (filtered) cluster/group entries, pad their
# sequences to equal length, sort them by score (descending) and write the
# 'features', 'evidence' and 'sequence' output files plus the two
# *_file_list index files into pars["out_dir"].
# Expects 'pars' to already contain the keys assembled by run()
# (out_dir, data_sub_dir, clusters_file, filter_params, log_transform,
# fudge, use_groups, feature_cols_to_extract, out_*_file).
def process_evidence(pars)

	###############################################
	# create directory structure and set parameters
	###############################################
	out_dir=pars["out_dir"]
	if(FileTest.exist?(out_dir) == false)
		Dir.mkdir(out_dir)
	end
	data_sub_dir=pars["data_sub_dir"]
	out_data_dir="#{out_dir}#{data_sub_dir}"
	if(FileTest.exist?(out_data_dir) == false)
		Dir.mkdir(out_data_dir)
	end
	clusters_file=pars["clusters_file"]
	log_transform=pars["log_transform"]
	fudge=pars["fudge"]
	###############################################
	# create directory structure and set parameters
	###############################################

	###########################################################
	# create output filename suffix based on 'filter' settings
	###########################################################
	# e.g. "ConversionEventCount_2ReadCount_103UTR" — TranscriptLocation
	# contributes only its value, the other filters "<name>_<value>"
	out_suff=""
	["ConversionEventCount", "ReadCount", "TranscriptLocation", "FilterType"].each do |filter|
		if(pars["filter_params"][filter] != nil)
			if(filter != "TranscriptLocation")
				out_suff<<"#{filter}_"
			end
			out_suff<<"#{pars["filter_params"][filter]}"
		end
	end
	###########################################################
	# create output filename suffix based on 'filter' settings
	###########################################################
	
	##################################
	# read in 'clusters' file entries
	##################################

	if(pars["use_groups"] != "yes")
		print "reading 'clusters' #{pars["filter_params"]["TranscriptLocation"]}\n"
		cluster_data = read_cluster_file(clusters_file, pars["filter_params"])
	else
		print "reading 'groups' #{pars["filter_params"]["TranscriptLocation"]}\n"
		cluster_data = read_groups_file(clusters_file, pars["filter_params"])
	end
	sequence_feature_key = get_sequence_feature_key(pars["use_groups"])
	max_len = find_max_len(cluster_data, sequence_feature_key)
	print "max_len = ", max_len,"\n"
	# pad all sequences to 'max_len' with 'X', then store each padded
	# sequence TWICE, '$'-terminated ("<seq>$<seq>$").
	# NOTE(review): the reason for duplicating the sequence is not visible
	# here — confirm the downstream sequence-file consumer expects this.
	cluster_data.each do |k, v|
		padded_seq = pad_X(v[sequence_feature_key], max_len)
		v[sequence_feature_key] = padded_seq + "$"+ padded_seq + "$"
	end
	score_feature_key = get_score_feature_key()

	# sort by score, highest first; note 'data' is now an Array of
	# [id, entry] pairs, not a Hash
	data = cluster_data.sort{|a,b| b[1][score_feature_key] <=> a[1][score_feature_key]}
#	data.each do |k,v|
#		print "#{k}: #{v.inspect}\n"
#		$stdin.gets
#	end
	##################################
	# read in 'clusters' file entries
	##################################

	##################################
	# create 'features' file
	##################################
	out_features_file=pars["out_features_file"]
	features_file_name_full_path = "#{out_dir}#{data_sub_dir}#{out_features_file}"
	feature_cols_to_extract=pars["feature_cols_to_extract"]

	# one tab-separated row per entry; column names go to a ".names" file
	feature_names, feature_mat = extract_features(data, feature_cols_to_extract, log_transform)
	f_features = File.open(features_file_name_full_path, "w")
	f_feature_names = File.open("#{features_file_name_full_path}.names", "w")
	f_feature_names.write(feature_names.join("\t") + "\n") 
	f_feature_names.close()
	feature_mat.each do |e|
		f_features.write(e.join("\t") + "\n") 
	end
	f_features.close()
	##################################
	# create 'features' file
	##################################

	##################################
	# create 'evidence_file_list'
	##################################
	out_evidence_file=pars["out_evidence_file"]
	evidence_file_name="#{data_sub_dir}#{out_evidence_file}_#{out_suff}"
	f_evi_file_list = File.open("#{out_dir}evidence_file_list", "w")
	f_evi_file_list.write(evidence_file_name)
	f_evi_file_list.close()
	##################################
	# create 'evidence_file_list'
	##################################

	##################################
	# create 'sequence_file_list'
	##################################
	out_sequence_file=pars["out_sequence_file"]
	seq_file_name="#{data_sub_dir}#{out_sequence_file}_#{out_suff}"
	f_seq_file_list = File.open("#{out_dir}sequence_file_list", "w")
	f_seq_file_list.write(seq_file_name)
	f_seq_file_list.close()
	##################################
	# create 'sequence_file_list'
	##################################

	#########################################
	# create 'sequence' and 'evidence' files
	#########################################
	evidence_file_name_full_path = "#{out_dir}#{evidence_file_name}"
	seq_file_name_full_path      = "#{out_dir}#{seq_file_name}"
	f_evi = File.open(evidence_file_name_full_path, "w")
	f_seq = File.open(seq_file_name_full_path, "w")
	if(log_transform == true)
		# minimum score is needed to shift all scores positive before log2
		min_val_score_feature = get_min_value_of_score_feature(data)
	end
	data.each do |k, v|
		score_feature_key = get_score_feature_key()
		score_feature = v[score_feature_key].to_f
		if(log_transform == true)
			# shift so the smallest score becomes 'fudge' (> 0) before log2;
			# NOTE(review): requires pars["fudge"] to be set — see run()
			if(min_val_score_feature <= 0)
				score_feature += (fudge - min_val_score_feature)
			end
			f_evi.write(k+"\t"+log2(score_feature).to_s+"\n")
		else
			f_evi.write(k+"\t"+score_feature.to_s+"\n")
		end
		f_seq.write(v[sequence_feature_key])
	end
	f_evi.close()
	f_seq.close()
	#########################################
	# create 'sequence' and 'evidence' files
	#########################################

	print "total number of output sequence regions = #{data.length}\n"
end


# Top-level driver: read the params file, assemble filter / transform /
# I-O settings into 'pars', create the output directory, write the
# resolved parameter file, then hand everything to process_evidence().
# - pars_file: path of the key=value parameter file
# - use_groups: "yes" to process a groups CSV, anything else for clusters
# - feature_cols_to_extract: feature column names for the features file
# BUGFIX: when log_transform=true and no fudge was given, the default was
# only printed but never stored, leaving pars["fudge"] nil and crashing
# process_evidence as soon as the minimum score was <= 0; the default is
# now written into pars. Also removed a dead out_dir assignment that was
# immediately overwritten, the unused local 'log_transform', and fixed
# the "deafult" typo in the log message.
def run(pars_file, use_groups, feature_cols_to_extract)
	pars = ParamsReader.read_params_file(pars_file)

	pars["use_groups"] = use_groups
	pars["feature_cols_to_extract"] = feature_cols_to_extract

	################################################
	# filter settings for PAR-CLIP sequence clusters
	################################################
	pars["filter_params"] = {}
	# sample values: TranscriptLocation={3UTR, 5UTR, CDS, intron, 3UTR+CDS, 3UTR+CDS+intron,...}
	if(pars["TranscriptLocation"] == nil)
		print "WARNING: TranscriptLocation not specified!\n"
	else
		pars["filter_params"]["TranscriptLocation"] = pars["TranscriptLocation"]
		pars.delete("TranscriptLocation")
	end

	# allowed values: FilterType=NA, not_NA
	if(pars["FilterType"] != nil)
		pars["filter_params"]["FilterType"] = pars["FilterType"]
	end
	pars.delete("FilterType")

	# sample values: ConversionEventCount={1,2,3,...}
	if(pars["ConversionEventCount"] != nil)
		pars["filter_params"]["ConversionEventCount"] = pars["ConversionEventCount"]
	end
	pars.delete("ConversionEventCount")

	# sample values: ReadCount={1,2,3,...}
	if(pars["ReadCount"] != nil)
		pars["filter_params"]["ReadCount"] = pars["ReadCount"]
	end
	pars.delete("ReadCount")
	################################################
	# filter settings for PAR-CLIP sequence clusters
	################################################

	#############################
	# transformation parameters
	#############################
	fudge = 1.0
	if(pars["log_transform"] != nil and pars["log_transform"]=="true")
		pars["log_transform"] = true
		if(pars["fudge"] != nil)
			pars["fudge"] = pars["fudge"].to_f
		else
			# store the default so process_evidence never sees a nil fudge
			pars["fudge"] = fudge
			print "setting default: fudge = #{fudge}\n"
		end
	else
		pars["log_transform"] = false
	end
	#############################
	# transformation parameters
	#############################

	##################
	# I/O parameters
	##################
	if(pars["clusters_file"] == nil)
		print "ERROR: need to specify clusters_file\n"
		exit
	end

	# output directory is derived from the input CSV file name; the same
	# rule applies to both clusters and groups input
	pars["out_dir"] = pars["clusters_file"].split(".csv")[0]+"/"

	print "clusters_file = #{pars["clusters_file"]}\n"
	print "output directory = #{pars["out_dir"]}\n"

	if(FileTest.exist?(pars["out_dir"]) == false)
		Dir.mkdir(pars["out_dir"])
	end

	#----------------------#
	# params with defaults #
	#----------------------#
	# sample value: out_evidence_file=evidence
	if(pars["out_evidence_file"] == nil)
		pars["out_evidence_file"] = "evidence"
	end
	# sample value: out_sequence_file=seq
	if(pars["out_sequence_file"] == nil)
		pars["out_sequence_file"] = "seq"
	end
	# sample value: out_features_file=features
	if(pars["out_features_file"] == nil)
		pars["out_features_file"] = "features"
	end
	# sample value: data_sub_dir=data/
	if(pars["data_sub_dir"] == nil)
		pars["data_sub_dir"] = "data/"
	end
	# sample value: out_params_file=parms_PARCLIP_input_processing
	if(pars["out_params_file"] == nil)
		pars["out_params_file"] = "parms_PARCLIP_input_processing"
	end
	out_pars_file = "#{pars["out_dir"]}#{pars["out_params_file"]}"
	create_output_params_file(out_pars_file, pars)
	#----------------------#
	# params with defaults #
	#----------------------#
	##################
	# I/O parameters
	##################

	process_evidence(pars)
end

################################
# parse command-line parameters
################################
# Parse command-line arguments of the form key=value into a hash.
# BUGFIX: uses split("=", 2) so that values containing '=' (e.g.
# "filter=a=b") are kept intact; the previous split("=")[1] silently
# truncated them at the second '='. Arguments without '=' map to nil,
# as before.
def parse_ARGV()
  pars = {}
  ARGV.each do |arg|
    key, value = arg.split("=", 2)
    pars[key] = value
  end
  return pars
end
pars = parse_ARGV()

# example value: params_file=preprocessing_PAR_CLIP.params
if(pars["params_file"] == nil)
	print "ERROR: need to specify params_file\n"
	print "command FORMAT: ./preprocess_PARCLIP.rb params_file=<file_name> groups=<yes/no>\n"
	exit
end

# NOTE(review): 'groups' is unconditionally forced to "no" here, so any
# groups=<yes/no> given on the command line is ignored and the nil-check
# below can never trigger — confirm whether this override is intentional.
pars["groups"]="no"

# example value: groups=no (default: PARalyzer clusters)
if(pars["groups"] == nil)
	print "ERROR: need to specify if only groups are used (default: PARalyzer clusters)\n"
	print "command FORMAT: ./preprocess_PARCLIP.rb params_file=<file_name> groups=<yes/no>\n"
	exit
end
################################
# parse command-line parameters
################################

#############
# run command
#############
# feature extraction is currently disabled (empty list); re-enable by
# restoring one of the commented alternatives below
#feature_cols_to_extract=["TranscriptLocation", "Strand", "FilterType", "GeneName", "TranscriptID"]
#feature_cols_to_extract=["TranscriptLocation"]
feature_cols_to_extract=[]
run(pars["params_file"], pars["groups"], feature_cols_to_extract)
print "pre-processing of input peaks done\n\n"
#############
# run command
#############

