#!/usr/bin/env ruby

require_relative "params_reader.rb"

########################
# helper functions
########################
def complement_proc(el)
    if(el == "A") 
        el = "T"
      elsif(el == "T")
      el = "A"
      elsif(el == "G") 
        el = "C"
      elsif(el == "C") 
      el = "G"
      elsif(el == "a")
      el = "t"
      elsif(el == "t")
        el = "a"
      elsif(el == "g")
      el = "c"
      elsif(el == "c") 
      el = "g"
      else
        el = el
      end
      return el
end  

# returns the invert-complement of a sequence of nucleotides
def invertAndComplSequence(seq)
  return seq.split(//).reverse.collect {|el| complement_proc(el)}.join
end

def num_unique_kmers(seq, k)
  kmers = {}
  (0..(seq.length-k)).each do |i|
    kmers[seq[i..(i+k-1)]]=1
  end
#  print kmers.inspect
  return kmers.length
end
########################
# helper functions
########################

#def estimate_seq_stats(seq, rev_compl_flag)
#
#	length = seq.strip.gsub(/\$/, "").gsub(/X/, "").length
#	k=5
#	num_uniqe_5mers = num_unique_kmers(seq, k)
# 
#	mm0 = 	{
#		"A"=>0,
#		"T"=>0,
#		"C"=>0,
#		"G"=>0
#		}
#
#	mm1 = 	{
#		"AA"=>0,"AG"=>0,"AC"=>0,"AT"=>0,
#         	"GA"=>0,"GG"=>0,"GC"=>0,"GT"=>0,
#         	"CA"=>0,"CG"=>0,"CC"=>0,"CT"=>0,
#         	"TT"=>0,"TG"=>0,"TC"=>0,"TA"=>0
#		}
#
#	if(mm0[seq[0..0]] != nil)
#		mm0[seq[0..0]] += 1
#	end
#	(1...seq.length).each do |i|
#		e = seq[i..i]
#		if(mm0[e] != nil)
#			mm0[e] += 1
#		end
#		ee = seq[i-1..i]
#		if(mm1[ee] != nil)
#			mm1[ee] += 1
#		end
#	end
#
#	stats = {
#		"MM0"=>mm0, 
#		"MM1"=>mm1, 
#		"length"=> {"length" => length}, 
#		"num_uniqe_5mers"=>{"num_uniqe_5mers"=>num_uniqe_5mers}
#		}
#
#	if(rev_compl_flag == true or rev_compl_flag == "true")
#		stats = {
#				"MM0" => {
#					"T"=>mm0["T"]*2,
#					"C"=>mm0["C"]*2},
#				"MM1" => {
#					"AC"=>mm1["AC"]*2, "CA"=>mm1["CA"]*2, "TC"=>mm1["TC"]*2,
#					"TT"=>mm1["TT"]*2, "CC"=>mm1["CC"]*2, "CT"=>mm1["CT"]*2, 
#					"CG"=>mm1["CG"],   "AT"=>mm1["AT"]
#					 },
#				"length" => {"length" => length},
#				"num_uniqe_5mers"=>{"num_uniqe_5mers"=>num_uniqe_5mers}
#			}
#	else
#		stats["MM0"].each do |k, v|
#			stats["MM0"][k] = sprintf("%4d", v.to_i / 2)
#		end
#		stats["MM1"].each do |k, v|
#			stats["MM1"][k] = sprintf("%4d", v.to_i / 2)
#		end
#		stats["length"]["length"] = stats["length"]["length"] / 2
#		stats["num_uniqe_5mers"]["num_uniqe_5mers"] = stats["num_uniqe_5mers"]["num_uniqe_5mers"] / 2
#	end
#  return stats
#end
#seq = "CTTTTTTTCTTTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX$CTTTTTTTCTTTXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX$"
#rev_compl_flag = false
#estimate_seq_stats(seq, rev_compl_flag)



def estimate_seq_stats(seq, rev_compl_flag)

	length = seq.strip.gsub(/\$/, "").gsub(/X/, "").length
	k=5
	num_uniqe_5mers = num_unique_kmers(seq, k)
 
	mm0 = 	{
		"A" => 0,
		"T" => 0,
		"C" => 0,
		"G" => 0
		}

	mm1 = 	{
		"AA"=>0,"AG"=>0,"AC"=>0,"AT"=>0,
         	"GA"=>0,"GG"=>0,"GC"=>0,"GT"=>0,
         	"CA"=>0,"CG"=>0,"CC"=>0,"CT"=>0,
         	"TT"=>0,"TG"=>0,"TC"=>0,"TA"=>0
		}

	if(mm0[seq[0..0]] != nil)
		mm0[seq[0..0]] += 1
	end
	(1...seq.length).each do |i|
		e = seq[i..i]
		if(mm0[e] != nil)
			mm0[e] += 1
		end
		ee = seq[i-1..i]
		if(mm1[ee] != nil)
			mm1[ee] += 1
		end
	end

	stats = {
		"MM0"=>mm0, 
		"MM1"=>mm1, 
		"length"=> {"length" => length}, 
		"num_uniqe_5mers"=>{"num_uniqe_5mers"=>num_uniqe_5mers}
		}

	if(rev_compl_flag == true)
#		print "rev_compl_flag == true\n"
#		$stdin.gets
		stats = {
				"MM0" => {
					"T_or_A"=>mm0["T"],
					"C_or_G"=>mm0["C"]},
				"MM1" => {
					"AC_or_GT"=>mm1["AC"], "CA_or_TG"=>mm1["CA"],
					"TC_or_GA"=>mm1["TC"], "CT_or_AG"=>mm1["CT"],
					"TT_or_AA"=>mm1["TT"], "CC_or_GG"=>mm1["CC"],
					"CG"=>mm1["CG"]      , "GC"=>mm1["GC"],
					"AT"=>mm1["AT"]      , "TA"=>mm1["TA"]
					 },
				"length" => {"length" => length},
				"num_uniqe_5mers"=>{"num_uniqe_5mers"=>num_uniqe_5mers}
			}
	else
		stats["MM0"].each do |k, v|
			stats["MM0"][k] = sprintf("%4d", v.to_i / 2)
		end
		stats["MM1"].each do |k, v|
			stats["MM1"][k] = sprintf("%4d", v.to_i / 2)
		end
		stats["length"]["length"] = stats["length"]["length"] / 2
		stats["num_uniqe_5mers"]["num_uniqe_5mers"] = stats["num_uniqe_5mers"]["num_uniqe_5mers"] / 2
	end
  return stats
end










def extract_seq_stats(sequence_file, rev_compl_flag)

	annot_arr = []
	col_names = []

	data=[]
	count = 0
	seq=""
	print "sequence_file: #{sequence_file}\n"
	lines = IO.readlines(sequence_file)[0].split("$")
	lines.each do |l|
		seq<<l+"$"
		count += 1
		if(count % (lines.length / 20) == 0)
			print (count / (lines.length / 20))*(100 / 20),"% "
			$stdout.flush 
		end
		if(count % 2 == 0)
			stats = estimate_seq_stats(seq, rev_compl_flag)
			evidence_entry_index = count / 2 - 1
			if(annot_arr != [])
				(0...annot_arr[evidence_entry_index].length).each do |i|
					if(col_names[i] != "ClusterID")
						stats[col_names[i]] = {col_names[i]=>annot_arr[evidence_entry_index][i]}
					end
				end
			end
			data.push([stats, seq])
			seq=""  
		end
	end
	return data
end
#seq_file="/nfs/labs/compbio/sgeorg/Arabidopsis/Miguel/data/12K_arab_3000_upstream_processed_sorted"
#extract_seq_stats(seq_file)




def build_entry(stats, params)
#	print "stats.keys.inspect: #{stats.keys.inspect}\n"
	entry = []
	params.each do |k, v|
#		print "k = #{k}: v = #{v}\n"
		if(v == true and stats[k] != nil)
			entry.push(stats[k].values)
		end
	end
	return entry.flatten
end

def build_entry_names(stats, params)
	entry = []
	params.each do |k, v|
#		print "k = #{k}: v = #{v}\n"
		if(v == true and stats[k] != nil)
			entry.push(stats[k].keys)
		end
	end
	return entry
end

def build_out_file_name(out_file_core, params)
	out_file_modified = out_file_core
	first = true
	params.each do |k, v|
#		print "k = #{k}: v = #{v}\n"
		if(v == true)
			if(first == false)
				out_file_modified<<"_"
			end
			first = false
			out_file_modified<<"#{k}"
		end
	end
  	return out_file_modified
end

def output_seq_stats(data, out_file, features_file_full_path, params)

	features_file_exists=true
	if(FileTest.exist?(features_file_full_path) == false or FileTest.exist?(features_file_full_path+".names") == false)
		features_file_exists=false
	end

	stats = data[0][0]
	entry_names = build_entry_names(stats, params)

	entry = build_entry(stats, params)
	num_entries = entry.length
	num_lines = data.length
	if(num_entries == 0)
		print "empty file!\n"
		return
	end
	out_file_modified = build_out_file_name(out_file, params) 
	out_file_entry_names = out_file_modified

	features=[]	
	feature_names = ""
	if(features_file_exists == true)
		feature_names = IO.readlines(features_file_full_path+".names")[0].strip
		features = IO.readlines(features_file_full_path)

		out_file_modified = out_file_modified + "#{feature_names.split("\s").join("_")}"
		out_file_entry_names = out_file_entry_names + "#{feature_names.split("\s").join("_")}.var_names"

		f_out = File.open(out_file_entry_names, "w")
		f_out.write("#{feature_names}\t")
		f_out.close()
	else
		out_file_entry_names= out_file_entry_names + ".var_names"
	end

	f_out = File.open(out_file_entry_names, "a")
	f_out.write("#{entry_names.join(" ")}\n")
	f_out.close()

	print "\nout_file = #{out_file_modified}\n"
	print "\nout_file_names = #{out_file_entry_names}\n"
	print "-----------------------\n"

	f_out = File.open(out_file_modified, "w")
	if(features != [])
		num_entries += feature_names.split("\s").length
	end
	f_out.write("#{num_lines}\t#{num_entries}\n")
	ind = 0
	data.each do |e|
		if(features != [])
			feature_data = features[ind].strip+"\t"
			f_out.write(feature_data)
			ind+=1
		end

		stats = e[0]
		entry = build_entry(stats, params)
		out_line = "#{entry.join(" ")}\n"
		f_out.write(out_line)
	end
	f_out.close()
end

def do_seq_stats(io_params, params_instance)
	features_file_full_path="#{io_params["in_dir"]}#{io_params["features_file"]}"
	seq_file_full_path="#{io_params["in_dir"]}#{io_params["sequence_file"]}"

	print "extract_seq_stats\n"
	data = extract_seq_stats(seq_file_full_path, io_params["rev_compl_flag"])
	print "done\n"
#	$stdin.gets

	out_dir=io_params["out_dir"]
	if(FileTest.exist?(out_dir) == false)
		print "directory\n", out_dir, "\ndoesn't exist creating..."
		Dir.mkdir(out_dir)
		print "done\n"
	end
	out_file_full_path="#{io_params["out_dir"]}"
	output_seq_stats(data, out_file_full_path, features_file_full_path, params_instance)
end



def extract_all_combos(io_params, params, keys_arr, params_instance, active_key_index)
	if(active_key_index == keys_arr.length)
		count = 0
		all_selections_false = true
		params_instance.each do |k, v|
			if(v == true)
				all_selections_false = false
			end
			print "#{k} = #{v}"
			if(count == params_instance.length - 1)
				print "\n"
			else
				print ", "
			end	
			count += 1
		end
		if(all_selections_false == true)
			return
		end
		do_seq_stats(io_params, params_instance)
		return
	end
	params[keys_arr[active_key_index]].each do |v|
		params_instance[keys_arr[active_key_index]] = v
		extract_all_combos(io_params, params, keys_arr, params_instance, active_key_index + 1)
	end
end



#########################
# command-line parameters
#########################

def parse_ARGV()
  pars = {}
  ARGV.each do |a|
    pars[a.split("=")[0]] = a.split("=")[1]
  end
  return pars
end
pars = parse_ARGV()



def run(pars)
	###############################################
	# IO parameters (provided in key-value format)
	###############################################
	io_params = {}
	io_params_data = ParamsReader.read_params_file(pars["io_params_file"])
	out_suff=""
	["ConversionEventCount", "ReadCount", "TranscriptLocation", "FilterType"].each do |filter|
		if(io_params_data[filter] != nil)
			if(filter != "TranscriptLocation")
				out_suff<<"#{filter}_"
			end
			out_suff<<"#{io_params_data[filter]}"
		end
	end
#	print "out_suff = #{out_suff}\n"

	# use in case filename with suffix does exist
	if(io_params_data["clusters_file"] != nil and FileTest.exist?(io_params_data["clusters_file"]) == true)
#		split_suff = io_params_data["clusters_file"].scan(/_[^_]*.csv/)[0]
#		split_suff = File.basename(io_params_data["clusters_file"])
#		print "split_suff = #{split_suff}\n"

#		root_dir=io_params_data["clusters_file"].split(split_suff)[0]+"/"
#		io_params["out_dir"]="#{root_dir}covariates/"
#		io_params["in_dir"]="#{root_dir}data/"

		root_dir=File.dirname(io_params_data["clusters_file"]) + "/" + File.basename(io_params_data["clusters_file"]).split(".csv")[0]
		io_params["in_dir"]=root_dir + "/" + File.dirname(io_params_data["clusters_file"]) +"/"
		io_params["out_dir"]="#{root_dir}/covariates/"


		io_params["sequence_file"]="seq_#{out_suff}"
#		seq_file_full_path="#{io_params["in_dir"]}#{io_params["sequence_file"]}_#{out_suff}"
		seq_file_full_path="#{io_params["sequence_file"]}_#{out_suff}"

		print "seq_file_full_path = #{seq_file_full_path}\n"


		# use in case filename with suffix does exist
		if(FileTest.exist?(seq_file_full_path) == true)
			io_params["sequence_file"]="seq"
		end
	else
		if(io_params_data["seq_file"] != nil and FileTest.exist?(io_params_data["seq_file"]) == true)
			arr = io_params_data["seq_file"].split("/")
			io_params["sequence_file"]=arr.last
			arr.pop()
			io_params["in_dir"]="./#{arr.join("/")}/"
		else
			print "ERROR: 'extract_covariates.rb', a sequence file or a peaks file must be specified as input!\n"
			exit
		end

		out_dir="./default_extract_cov_out/"
		if(io_params_data["out_path_covariates"] != nil)
			out_dir=io_params_data["out_path_covariates"]
			if(out_dir[0..0] != ".")
				out_dir = "./"+out_dir
			end
			if(out_dir[-1..-1] != "/")
				out_dir<<"/"
			end	
		end

		if(FileTest.exist?(out_dir) == false)
			Dir.mkdir(out_dir)
		end
		cov_sub_dir="#{out_dir}covariates/"
		if(FileTest.exist?(cov_sub_dir) == false)
			Dir.mkdir(cov_sub_dir)
		end
		io_params["out_dir"]="#{out_dir}"
	end

	# default assumption is 'rev_compl_flag' = true
	io_params["rev_compl_flag"] = true
	if(io_params_data["strand_specific_analysis"] == "true")
		io_params["rev_compl_flag"] = false
	end

	io_params["features_file"] = "features"

	print "in_dir: #{io_params["in_dir"] }\n"
	print "out_dir: #{io_params["out_dir"] }\n"
	print "sequence regions file: #{io_params["sequence_file"]}\n"
	print "====================\n"
	###############################################
	# IO parameters (provided in key-value format)
	###############################################

	##################################################################
	# required command-line parameters (provided in key-value format)
	##################################################################
	# covariates.params
	# example: params_file=covariates.params
	if(pars["params_file"] == nil)
		print "ERROR: need to specify params_file\n"
		exit 
	end
	params = {}
	ParamsReader.read_params_file(pars["params_file"]).each do |e|
		params[e[0]] = []
		e[1].split(",").each do |val|
			if(val == "true")
				params[e[0]].push(true)			
			elsif(val == "false")
				params[e[0]].push(false)			
			else
				print "ERROR parameter values must be se to true or false or both!\n"
				exit	
			end
		end
	end
	options = {}
	options["params"] = params
	##################################################################
	# required command-line parameters (provided in key-value format)
	##################################################################
	
	##########################
	# eaxtract covariates 
	##########################
	active_key_index = 0
	params_instance  = {}
	extract_all_combos(io_params, options["params"], options["params"].keys, params_instance, active_key_index)
	##########################
	# eaxtract covariates 
	##########################
end


#############
# run command
#############
run(pars)
# ./extract_seq_covariates_new.rb in_dir=./test_out/data/ params_file=covariates.params io_params_file=preprocessing_PAR_CLIP.params
print "extracting covariates done\n\n"
#############
# run command
#############
