//	This source and Text Subset Sequnce Length Count method apply to GNU General Public License. 
//			Copyright (C) 2001-2017 Jasenko Dzinleski 

//		This program is free software; you can redistribute it
//	and/or modify it under the terms of the GNU General Public License as
//	published by the Free Software Foundation; either version 2 of the
//	License, or (at your option) any later version. 

//	This program is distributed in the hope that it will be useful, but
//	WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
//	General Public License for more details. 

//	You should have received a copy of the GNU General Public License along
//	with this program; if not, write to the Free Software Foundation, Inc.,
//	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

//  	Text Subset Sequnce Length Count  
//	written by Dzinleski Jasenko  December 2016 , Jun 2017

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <math.h>

FILE		*f1;

char		infn[256];

char		sv[256];
long long 	laH[1000000];int laHi=0;
int       	lac[1000000]; 
char    	svc[256];
long long 	lb;
char    	lbs[256];
int 		sc;

double mpow(int d,int a){if(!a){return(1);}else{if(a==1){return(d);}else{double b=d;int c=2;while(c<=a){b*=d;++c;}return(b);}}}

int p_2(int sl)
{
int i,j;

	for(j=0;j<strlen(sv)-sl;++j)
	{
        	lb=0;sc=0;for(i=0;i<256;++i){lbs[i]='\0';}
		for(i=j;i<j+sl;++i)
		{
            		if(sv[i]=='A'){lbs[sc]='1';lbs[++sc]='0';lbs[++sc]='0';}else{
                		if(sv[i]=='C'){lbs[sc]='0';lbs[++sc]='1';lbs[++sc]='0';}else{
                    			if(sv[i]=='G'){lbs[sc]='1';lbs[++sc]='1';lbs[++sc]='0';}else{
                        			if(sv[i]=='T'){lbs[sc]='0';lbs[++sc]='0';lbs[++sc]='1';}else{lbs[sc]='1';lbs[++sc]='0';lbs[++sc]='1';}
            		}}}++sc;
		}
		lb=0;for(i=0;i<strlen(lbs)&&i<64;++i){if(lbs[i]=='1'){lb|=(long long)1<<i;}}
		        if(!laHi){laH[laHi]=lb;lac[laHi]=1;++laHi;}else{
		            for(i=0;i<laHi;++i){if(laH[i]==lb){++lac[i];break;}}if(i==laHi){laH[laHi]=lb;lac[laHi]=1;++laHi;}
		}
	}
	return(0);
}

int p_1(char infn_[256],int sl,int st,int al)
{
int	i,j,k,l,cc=0;
int	fb;
int	ec;

	laHi=0;for(i=0;i<256;++i){lac[i]=0;}

	f1=fopen(infn_,"rb");
	fb=getc(f1);
	while(!feof(f1))
	{
		for(i=0;i<256;++i){sv[i]='\0';}i=0;
		while((fb!=10)&&(fb!=13)){sv[i]=fb;++i;fb=getc(f1);}
		while((fb==10)||(fb==13)){fb=getc(f1);}++cc;
		if(cc>=st){p_2(sl);if(laHi>=al){break;}}
	}
	fclose(f1);

    	ec=0;for(i=0;i<laHi;++i){if(lac[i]==1){++ec;}}
    	for(i=0;i<laHi;++i)
    	{
		if(lac[i]>1)
		{
	        	for(j=0;j<256;++j){lbs[j]='\0';svc[j]='\0';}
	        	k=0;for(j=0;j<64;++j){lb=(long long)1<<j;if(((laH[i]&lb)>>j)==1){lbs[k]='1';}else{lbs[k]='0';}++k;}
	        	//printf("%s\n",lbs);
	        	l=0;sc=0;while(sc<strlen(lbs))
	        	{
	        	if(lbs[sc]=='1'&&lbs[1+sc]=='0'&&lbs[1+1+sc]=='0'){svc[l]='A';++l;}else{
	            		if(lbs[sc]=='0'&&lbs[1+sc]=='1'&&lbs[1+1+sc]=='0'){svc[l]='C';++l;}else{
	                		if(lbs[sc]=='1'&&lbs[1+sc]=='1'&&lbs[1+1+sc]=='0'){svc[l]='G';++l;}else{
	                    			if(lbs[sc]=='0'&&lbs[1+sc]=='0'&&lbs[1+1+sc]=='1'){svc[l]='T';++l;}else{svc[l]='_';++l;}
	        	}}}sc+=3;
	        	}
			k=0;for(j=0;j<strlen(svc);++j){if(svc[j]=='_'){++k;}}
			if(k!=strlen(svc)){printf("%s\t%d\t%d\t%e\n",svc,strlen(svc)-k,lac[i],log((double)ec/(double)lac[i]));}		
		}
	}

	return(0);
}

int main(int argc,char *argv[])
{

int	i,j,k,l,m,n;
int	a,b,c,cc=0;
char	sp1[256];
char	sp2[256];
int	ip1,ip2;

	if(argc<3){return(0);}else{strcpy(infn,argv[1]);f1=fopen(infn,"rb");if(f1==NULL){return(0);}fclose(f1);}
	
	strcpy(sp1,argv[2]);
	strcpy(sp2,argv[3]);

	ip1=0;j=strlen(sp1)-1;for(i=0;i<strlen(sp1);++i){ip1+=((int)sp1[i]-48)*(int)mpow(10,j);--j;}
	ip2=0;j=strlen(sp2)-1;for(i=0;i<strlen(sp2);++i){ip2+=((int)sp2[i]-48)*(int)mpow(10,j);--j;}

	p_1(infn,10,ip1,ip2);
	p_1(infn,11,ip1,ip2);
	p_1(infn,12,ip1,ip2);
	p_1(infn,13,ip1,ip2);
	p_1(infn,14,ip1,ip2);
	p_1(infn,15,ip1,ip2);
	p_1(infn,16,ip1,ip2);
	p_1(infn,17,ip1,ip2);
	p_1(infn,18,ip1,ip2);
	p_1(infn,19,ip1,ip2);
	p_1(infn,20,ip1,ip2);

	return(0);
}