//	This source and similar text sequences method apply to GNU General Public License. 
//			Copyright (C) 2001-2014 Jasenko Dzinleski 

//		This program is free software; you can redistribute it
//	and/or modify it under the terms of the GNU General Public License as
//	published by the Free Software Foundation; either version 2 of the
//	License, or (at your option) any later version. 

//	This program is distributed in the hope that it will be useful, but
//	WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
//	General Public License for more details. 

//	You should have received a copy of the GNU General Public License along
//	with this program; if not, write to the Free Software Foundation, Inc.,
//	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.

//  	Segregation of Text Sequences by their Complexity  
//	written by Dzinleski Jasenko  December 2016

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

FILE		*f1,*f2;

char		infn[256]="out_s2_s16.txt";
char		cv[256];int cvi=0;
char		sa[200000][256];int sai=0;
char		sa_[200000][3*256];
int		ba24[256][256][256];
double		dmc[3*256];
double		dmi[200000][2];int	dmi_=0;
double 		mci=0.0001;

int main(int argc,char *argv[])
{

int	a,b,c,mcc=0,cL;
int	i,j,k,l,m,n,mc;
int	fb;


	if(argc<2){return(0);}
	if(strlen(argv[1])==0){return(0);}
	strcpy(infn,argv[1]);

	for(i=0;i<256;++i){for(j=0;j<256;++j){for(k=0;k<256;++k){ba24[i][j][k]=0;}}}

	f1=fopen(infn,"rb");
	fb=getc(f1);
	while(!feof(f1))
	{
		cvi=0;while((fb!=10)&&(fb!=13)&&!feof(f1)){cv[cvi]=fb;++cvi;fb=getc(f1);}while((fb==10)||(fb==13)&&!feof(f1)){fb=getc(f1);}if(cvi<6){continue;}
		for(j=0;j<256;++j){sa[sai][j]='\0';}
		k=0;for(i=0;i<cvi;++i){sa[sai][k]=cv[i];++k;}
		for(j=0;j<3*256;++j){sa_[sai][j]='\0';}
		j=0;for(l=0;l<=-3+strlen(sa[sai]);++l)
		{
			if(!j)
			{
				a=0;b=l;
				while(a<3&&b<strlen(sa[sai])){sa_[sai][j]=sa[sai][b];++j;++b;++a;}
			}else{
				a=0;b=l;
				while(a<3&&b<strlen(sa[sai])){sa_[sai][j]=sa[sai][b];++j;++b;++a;}
				for(a=0;a<-3+strlen(sa_[sai]);a+=3)
				{c=-3+j;if((sa_[sai][a]==sa_[sai][c])&&(sa_[sai][1+a]==sa_[sai][++c])&&(sa_[sai][1+1+a]==sa_[sai][++c])){break;}}
				if(a!=(-3+strlen(sa_[sai]))){sa_[sai][-1-1-1+j]='\0';sa_[sai][-1-1+j]='\0';sa_[sai][-1+j]='\0';j-=3;}
			}
		}
		//printf("%d\t%s\t%s\n",sai,sa[sai],sa_[sai]);
		++sai;
	}
	fclose(f1);

	for(i=0;i<sai;++i){for(j=0;j<=-3+strlen(sa_[i]);j+=3){++ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];}}

	// 1

	while(mci<0.9)
	{
		for(i=0;i<sai;++i)
		{
			m=0;n=9999999;
			for(j=0;j<=-3+strlen(sa_[i]);j+=3)
			{
				m+=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];
				if(n==9999999){n=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];}else{
					if(n>ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]){n=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];}
				}
			}
			//printf("%s\n",sa[i]);
			k=0;
			for(j=0;j<=-3+strlen(sa_[i]);j+=3)
			{	
			//printf("%e\t%e\t%e\n",
			//(double)n/(double)m,
			//(double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m,
			//((double)n/(double)m)/((double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m)
			//);
			dmc[k]=((double)n/(double)m)/((double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m);++k;
			}
			mc=0;for(j=0;j<k;++j){if(dmc[j]<=mci){++mc;}}
			if(mc>(int)((strlen(sa_[i])/3)/2)){++mcc;}
		}
		if(mcc>0){if(!dmi_){dmi[dmi_][0]=mcc;dmi[dmi_][1]=mci;++dmi_;}else{if((double)mcc>dmi[-1+dmi_][0]){dmi[dmi_][0]=mcc;dmi[dmi_][1]=mci;++dmi_;}}}
		mcc=0;mci+=0.0001;//printf("%d\n",dmi_);
	}
	for(i=0;i<dmi_;++i){printf("%d\t%e\n",(int)dmi[i][0],dmi[i][1]);}
	for(i=0;i<dmi_;++i){if(!i){m=(int)dmi[i][0]-(int)dmi[-1+i][0];}else{if(m<((int)dmi[i][0]-(int)dmi[-1+i][0])){j=i;m=((int)dmi[i][0]-(int)dmi[-1+i][0]);}}}
	printf("\n%d\t%e\t%d\n\n",(int)dmi[j][0],dmi[j][1],m);

	// 2

	double mcv;mcv=dmi[j][1];
	
	for(i=0;i<sai;++i)
	{
		m=0;n=9999999;
		for(j=0;j<=-3+strlen(sa_[i]);j+=3)
		{
			m+=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];
			if(n==9999999){n=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];}else{
				if(n>ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]){n=ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]];}
			}
		}
		//printf("%s\n",sa[i]);
		k=0;
		for(j=0;j<=-3+strlen(sa_[i]);j+=3)
		{	
		//printf("%e\t%e\t%e\n",
		//(double)n/(double)m,
		//(double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m,
		//((double)n/(double)m)/((double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m)
		//);
		dmc[k]=((double)n/(double)m)/((double)ba24[sa_[i][j]][sa_[i][1+j]][sa_[i][1+1+j]]/(double)m);++k;
		}
		mc=0;for(j=0;j<k;++j){if(dmc[j]<=mcv){++mc;}}
		if(mc>(int)((strlen(sa_[i])/3)/2)){printf("%s\n",sa[i]);}
	}

	return(0);
	
}