#GRFoukas - Master Thesis

#This script prepares the cullpdb3class data (Princeton Data) for SNNS: it pads every
#record, cuts it into sliding windows and one-hot encodes each window, producing the
#SNNS pattern file used for the prediction of protein secondary structure.

import numpy as np

# Width of the sliding window used for every pattern.
win_len = 101
# Number of pad characters added to each end of every record (half the window,
# rounded down).
padding = win_len // 2

# Stage 1: read the tab-separated records of in.txt and write each one to
# new.txt as two lines: the first field padded with "X" on both sides and the
# second field padded with spaces on both sides.
# NOTE(review): the first field is presumably the amino-acid sequence and the
# second its per-residue structure labels -- confirm against the data file.
with open("in.txt") as raw_file, open("new.txt", "w") as padded_file:
    for line_no, line in enumerate(raw_file, start=1):
        # Remove only the line terminator. Slicing off the last character
        # unconditionally would eat a real label when the final line has no
        # trailing newline or when the record carries extra fields.
        record = line.rstrip("\n")
        if not record.strip():
            # Blank lines (e.g. a trailing empty line) carry no record.
            continue

        fields = record.split("\t")
        if len(fields) < 2:
            # Fail before anything is written for this record, so new.txt
            # never ends with half a pair.
            raise ValueError(
                "in.txt line {}: expected two tab-separated fields".format(line_no)
            )

        padded_file.write("X" * padding + fields[0] + "X" * padding + "\n")
        padded_file.write(" " * padding + fields[1] + " " * padding + "\n")

# Stage 2: slide a win_len-wide window over every padded record of new.txt.
# new.txt holds records as line pairs. For each window position two lines are
# written to new2.txt: the window cut from the first line of the pair, and a
# line holding the character of the second line that sits under the window
# centre, surrounded by `padding` spaces on each side.
with open("new.txt") as padded_file, open("new2.txt", "w") as windows_file:
    for seq_line in padded_file:
        # An unpaired final line yields "" here and fails on indexing below,
        # exactly as a missing second line always did.
        struct_line = next(padded_file, "")

        sequence = seq_line.rstrip("\n")
        # Count the windows from the text itself instead of from the raw line
        # length, so the result no longer depends on a trailing newline being
        # present and counted.
        n_windows = len(sequence) - win_len + 1
        for start in range(n_windows):
            window = sequence[start:start + win_len]
            label = struct_line[start + padding]
            windows_file.write(
                window + "\n" + " " * padding + label + " " * padding + "\n"
            )

#The code below creates the one-hot encoding for the data

# Current local date and time, formatted as dd/mm/YYYY HH:MM:SS; it is written
# into the header of the final file.
from datetime import datetime

now = datetime.now()
dt_string = now.strftime("%d/%m/%Y %H:%M:%S")

# Letters that receive a one-hot code, in alphabetical order. The position of
# a letter in this list is the position of its "1".
keys = sorted("ACDEFGHIKLMNPQRSTVWY")
length = len(keys)

# Letter -> one-hot code. Every code is `length` tokens, each followed by a
# space (so a code ends with a trailing space). 'X', the padding character,
# maps to the all-zero code.
encoding_dict1 = {'X': "0 " * length}
for position, key in enumerate(keys):
    encoding_dict1[key] = "0 " * position + "1 " + "0 " * (length - position - 1)

# Structure label -> one-hot code over the three classes c, e and h.
encoding_dict2 = {'c': "1 0 0", 'e': '0 1 0', 'h': '0 0 1'}

# The decoding tables are derived by inverting the encoding tables so the two
# directions cannot drift apart (the hand-written table used to map
# 'h' -> '0 0 1' instead of '0 0 1' -> 'h').
decoding_dict1 = {code: letter for letter, code in encoding_dict1.items()}
decoding_dict2 = {code: label for label, code in encoding_dict2.items()}
    
# Count the patterns stored in new2.txt. Every pattern occupies two lines, so
# despite its name n_lines ends up holding the number of patterns (line count
# halved, rounded down). The `with` block closes the file even if reading
# fails.
with open("new2.txt") as f:
    n_lines = sum(1 for _ in f) // 2
# Stage 3: one-hot encode every window of new2.txt and write the result to
# encodings.txt, starting with an SNNS pattern-definition header. new2.txt
# holds patterns as line pairs: the window, then a line whose centre character
# is the target label.
with open("new2.txt") as f, open("encodings.txt", "w") as new_file:
    new_file.write(
        "SNNS pattern definition file V4.2" + "\n"
        + "generated at " + dt_string + "\n" + "\n" + "\n"
    )
    # Each window position contributes one code of `length` units.
    new_file.write(
        "No. of patterns : " + str(n_lines) + "\n"
        + "No. of input units : " + str(win_len * length) + "\n"
        + "No. of output units : 3" + "\n" + "\n"
    )

    # Patterns are numbered from 1 in the order they appear in new2.txt.
    for pattern_no, line1 in enumerate(f, start=1):
        # An unpaired final line yields "" and fails on indexing below.
        line2 = next(f, "")

        # Concatenate the per-letter codes, then drop the single trailing
        # space left by the last code. Only the newline is stripped from the
        # window, so a final line without one keeps its last residue. A letter
        # missing from encoding_dict1 raises KeyError.
        encoding1 = "".join(
            encoding_dict1[letter] for letter in line1.rstrip("\n")
        ).rstrip(" ")

        # The label sits in the middle of the space-padded line; counting the
        # trailing newline, that is index len // 2 - 1.
        encoding2 = encoding_dict2[line2[len(line2) // 2 - 1]]

        new_file.write(
            "# Input Pattern " + str(pattern_no) + ":" + "\n" + encoding1 + "\n"
            + "# Output Pattern " + str(pattern_no) + ":" + "\n" + encoding2 + "\n"
        )
