//	This source and similar text sequences method apply to GNU General Public License. 
//			Copyright (C) 2001-2016 Jasenko Dzinleski 

//		This program is free software; you can redistribute it
//	and/or modify it under the terms of the GNU General Public License as
//	published by the Free Software Foundation; either version 2 of the
//	License, or (at your option) any later version. 

//	This program is distributed in the hope that it will be useful, but
//	WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
//	General Public License for more details. 

//	You should have received a copy of the GNU General Public License along
//	with this program; if not, write to the Free Software Foundation, Inc.,
//	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

//  	Similar Text Sequences 
//	
//	written by Dzinleski Jasenko  April 2016



#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

FILE	*f1,*f2;

char	infn[256]="fasta__.txt";
char	outfn[256]="out_.txt";

int	fbf[1024];int fbfi;

int	dc[1024][50000][2];int dci;
int	pa[1024][1024];

char	ls[1024];int lsi;

int main(int argc,char *argv[])
{

int	i,j,k,l,m,n;
int	fb;

	f2=fopen(outfn,"w");
	f1=fopen(infn,"rb");
	fb=getc(f1);
	while(!feof(f1))
	{
		fbfi=0;while(fbfi<1024&&!feof(f1)){while(((fb==10)||(fb==13))&&!feof(f1)){fb=getc(f1);}fbf[fbfi]=fb;++fbfi;fb=getc(f1);}

		for(i=0;i<1024;++i){for(j=0;j<50000;++j){dc[i][j][0]=-1;}}dci=0;
		for(n=0;n<1024;++n){ls[n]='\0';}

		for(i=0;i<fbfi-1;++i)
		{
			for(j=i+1;j<fbfi;++j)
			{
				if(dc[j-i][0][0]==-1)
				{
					dc[j-i][0][0]=1;
					dc[j-i][dc[j-i][0][0]][0]=fbf[i];
					dc[j-i][dc[j-i][0][0]][1]=fbf[j];
					dc[j-i][dc[j-i][0][0]][2]=0;
				}else{
					for(k=0;k<=dc[j-i][0][0];++k)
					{
						if(dc[j-i][k][0]==fbf[i]&&dc[j-i][k][1]==fbf[j]){++dc[j-i][k][2];break;}
					}
					if(k+1==dc[j-i][0][0])
					{
						++dc[j-i][0][0];
						dc[j-i][dc[j-i][0][0]][0]=fbf[i];
						dc[j-i][dc[j-i][0][0]][1]=fbf[j];
						dc[j-i][dc[j-i][0][0]][2]=0;
					}
				}
			}
		}

		for(i=0;i<1024;++i){for(j=0;j<1024;++j){pa[i][j]=0;}}

		for(i=0;i<1024-1;++i)
		{
			for(j=i+1;j<1024;++j)
			{
				if(dc[j-i][0][0]!=-1)
				{
					for(k=1;k<=dc[j-i][0][0];++k)
					{
					//if(dc[j-i][k][2]>2){printf("%d %c %c %d\n",j-i,dc[j-i][k][0],dc[j-i][k][1],dc[j-i][k][2]);}
					if(dc[j-i][k][2]>2){pa[i][i+(j-i)]+=dc[j-i][k][2];}
					}
				}

			}
		}
		printf("======================================\n");
		for(i=0;i<1024;++i)
		{
			m=0;
			for(j=0;j<1024;++j)
			{
				if(pa[i][j]>0){if(m==0){m=pa[i][j];}else{if(m<pa[i][j]){m=pa[i][j];}}}
			}
			if(m>0)
			{
				for(j=0;j<1024;++j)
				{
					if(pa[i][j]>0)
					{
						if(((double)pa[i][j]/(double)m)>0.80)
						{
							printf("%c",fbf[j]);
							ls[lsi]=fbf[j];++lsi;
							if(lsi>1024){fprintf(f2,"%03d\t%s\n",lsi,ls);for(n=0;n<1024;++n){ls[n]='\0';}lsi=0;}
						}else{
							printf("_");
							if(strlen(ls)>=5){fprintf(f2,"%03d\t%s\n",lsi,ls);}for(n=0;n<1024;++n){ls[n]='\0';}lsi=0;
						}
					}else{printf("_");for(n=0;n<1024;++n){ls[n]='\0';}lsi=0;}
				}printf("\n");
			}
		}
	}
	fclose(f1);
	return(0);
}