/*** Program prep_psiblastali.c : slightly civilized version of prep_ali_97.c for public release

 Prepares a PSI-BLAST alignment (in pseudo-Clustal format)
 for further alignment-to-alignment comparisons as follows:
 - all columns with gaps in the QUERY (first) sequence are deleted;
 - any sequence identical to QUERY is purged;
 - only 1 copy is retained of any sequences that have identity > threshold.
 A threshold value of 97% is used.
 Output file : the file named by the -o argument (ARG_O).
 NOTE: the header originally described the output as <ARG_O>.<id_threshold>
 (<id_threshold>=97); the code in this file writes to the -o name unchanged.

***/

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <ctype.h>
#include <string.h>
#include <malloc.h>
#include <stddef.h>


/* Cast applied to pointers before they are handed to free() in the free_* helpers. */
#define FREE_ARG char*

/* Miscellaneous constants and helpers. Apart from MAXSTR, none of them is
   referenced by the code visible in this file. */
#define SQUARE(a) ((a)*(a))
#define NUM_METHOD 9
#define MAX_WINDOW 20
#define MAX_DELTASITE 20
/* MAXSTR sizes the static line buffer str[] and bounds fgets() in readali(). */
#define MAXSTR   100001
#define INDI -100

/* NOTE(review): the names and values below look like the parameters of a
   "minimal standard" random-number generator, but no generator is visible
   in this file -- confirm before relying on them. */
#define JMAX 40                                         
#define IA 16807
#define IM 2147483647
#define AM (1.0/IM)
#define IQ 127773
#define IR 2836
#define NTAB 32
#define NDIV (1+(IM-1)/NTAB)
#define EPS 1.2e-7
#define RNMX (1.0-EPS)

#define NRANSI
/* SWAP expands to three statements (with a trailing ';') and relies on a
   variable named temp being in scope at the point of use; it is therefore not
   safe as the unbraced body of an if/else or loop. */
#define SWAP(a,b) temp=(a);(a)=(b);(b)=temp;
#define M 7
#define NSTACK 50

#define LAMB_UNG 0.009925

/* Number of entries in idthresh[]; passed by main() to markSimilarSeqs(). */
#define IDTHRESH_NUM  1

/* Identity thresholds, in percent, handed to markSimilarSeqs() by main(). */
int idthresh[] = {97}; 
/* File-name suffixes matching idthresh[]; not referenced by the code visible here. */
char *exten_id[] = {".97"};
		
/* Scratch name buffer; not referenced by the code visible here. */
char namebf[100];

		
/* Decimal digit characters; not referenced by the code visible here. */
char *digit="0123456789";
/* Offset-indexed allocation helpers (definitions appear later in this file,
   except d3tensor, which has no definition in the part visible here).
   Note that cmatrix is declared twice; the two declarations are identical. */
void nrerror(char error_text[]);
char *cvector(long nl, long nh);
int *ivector(long nl, long nh);
double *dvector(long nl, long nh);
char **cmatrix(long nrl, long nrh, long ncl, long nch);
int **imatrix(long nrl, long nrh, long ncl, long nch);
double **dmatrix(long nrl, long nrh, long ncl, long nch);
char **cmatrix(long nrl, long nrh, long ncl, long nch);
double ***d3tensor(long nrl,long nrh,long ncl,long nch,long ndl,long ndh);

/* Matching release helpers for the allocators above. */
void free_ivector(int *v, long nl, long nh);
void free_dvector(double *v, long nl, long nh);
void free_cvector(char *v, long nl, long nh);
void free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch);
void free_imatrix(int **m, long nrl, long nrh, long ncl, long nch);
void free_cmatrix(char **m, long nrl, long nrh, long ncl, long nch);


/* Residue letter -> integer code conversions. Only am2num and am2numBZX are
   defined in the part of the file visible here. */
int a3let2num(char *let);
int am2num_c(int c);
int am2num(int c);
int am2numBZX(int c);

/* Small allocation/string helpers used by readali(). */
static void *mymalloc(int size);
char *strsave(char *str);
char *strnsave(char *str, int l);
static char **incbuf(int n, char **was);
static int *incibuf(int n, int *was);

/* Alignment input and output. */
void err_readali(int err_num);
void readali(char *filename);
static void printali(char *argo, int chunk, int n, int len, char **aname, char **aseqGap, int *start);

int **ali_char2int(char **aseq,int start_num, int start_seq);
int **read_alignment2int(char *filename,int start_num,int start_seq);

/* The remaining prototypes in this group have no definition in the part of
   the file visible here. */
char **traceback(char **aseq_mat1, char **aseq_mat2, int n1, int n2, int start_ali1, int start_ali2, int
end_ali1, int end_ali2, int **tracebackDir, int **flagNewGapQuery, int **flagNewGapDb, char **aseqGapTr1, char **aseqGapTr2);

char **tracebackPos(char **aseq_mat1, char **aseq_mat2, int n1, int n2, int start_ali1, int start_ali2, int
end_ali1, int end_ali2, int **tracebackDir, int **flagNewGapQuery, int **flagNewGapDb, char **aseqGapTr1, char **aseqGapTr2, int *apos1, int *apos2);


void counter(int b);
double effective_number(int **ali, int *marks, int n, int start, int end);
double effective_number_nogaps(int **ali, int *marks, int n, int start, int end);
double effective_number_nogaps_expos(int **ali, int *marks, int n, int start, int end, int pos);

void **freqInt(int **ali,int nal, int alilen, int **f,int *num_gaps,int
*effindiarr,double gap_threshold, double *p_comp);
void **freqIntMaskGaps(int **ali,int nal, int alilen, int **f, double gap_threshold, double gapRegionMin, double *p_comp);



/* Record type bundling per-alignment matrices, file names and counters.
   No object of this type is created or read by the code visible in this
   file; the only visible use is as a parameter type of the nmlconv()
   prototype. Field meanings are therefore not documented here. */
typedef struct _conv_info{
        double **fq1, **fq2, **hfq1, **hfq2, **icfq1, **icfq2;
        char *alifilename1, *alifilename2;
        int alignlen;
	int nali;
	int *ngap1, *ngap2;
	int gapless50;
	double eff_num_seq;
	double *over_all_frq;
	int  *eff_indi_arr1, *eff_indi_arr2;
	double *avc,*csi;
        double ***conv;
            } conv_info;

 	   /* Bundle of int arrays; not used by the code visible in this file. */
 	   typedef struct _score_Vector{
		int *noGap, *gapExists, *noGapOld, *gapExistsOld, *prevScoreGapQueryOld, *noGapStore, *gapExistsStore, *prevScoreGapQueryOldStore;
	   } score_Vector;

	/* File-scope state and prototypes. Of everything declared in this group,
	   the code visible in this file only uses aname, aseq, aseq1, aseq2,
	   aname2, nal, alilen, astart, astart2 and alen. */
	int *dbSequence, queryEnd, dbEnd, queryStart,dbStart;
	int gapOpen, gapExtend, dbLength, queryLength;
/*****	double **query, **matrix; *****/
	/* NOTE(review): the two static prototypes below have no definition in the
	   part of the file visible here. */
	static int basicSmithWatermanScoreOnly(int **query, int
queryLength, int *dbSequence, int dbLength, int **matrix, int gapOpen,
int gapExtend, int queryEnd, int dbEnd, int **tracebackDir, int **flagNewGapQuery, int **flagNewGapDb);

/*, Int4 *score ,BLAST_KarlinBlkPtr kbp, Nlm_FloatHi L, Nlm_FloatHi effSearchSpace, Nlm_FloatHi minGappedK)  */

	static int SmithWatermanFindStartGivenEnd(int **query,int
queryLength,
int *dbSequence, int dbLength, int **matrix, int gapOpen, int
gapExtend,
int queryEnd, int dbEnd, int score, int queryStart, int dbStart);

int score, End1, End2, Start1, Start2;

int gap_open = 320, gap_extend = 32;

int *Sequence2;
double lambda_al, score_scale;

int ScoreForTwoRows(double *subjectRow, double *queryRow);
double ScoreForTwoRows_Model(int *cntRow1, int *cntRow2, double *pseudoCntRow1, double *pseudoCntRow2);

double ScoreForTwoRows_Model6(int pos1, int pos2, double score_scale, double b);
double ScoreForTwoRows_Model6_smat(int pos1, int pos2);
double Sgap6_smat(int pos1, int pos2, double b);

double GapExtend1(int pos2, double b);
double GapExtend2(int pos1, double b);

int *ScoreOverColumn (int colScore, int flag1, int flag2, int flag3, int flag4, int flag5, int flag6, int *column_Score);

/* Alignment storage. readali() fills aname (sequence names), aseq (aligned
   sequences), astart (start numbers), alen (letter counts), nal (number of
   sequences) and alilen (alignment length). main() builds aseq1 (columns
   where the first sequence has no gap) and aseq2/aname2/astart2 (rows left
   after purging similar sequences). */
char **aname, **aname1, **aname2, **aseq, **aseq1, **aseq2;
int nal, nal1, nal2, nalmerge, alilen, alilen1, alilen2, maxalilen,
*astart, *astart1, *astart2, *alen;
int *aposnogp1, *aposnogp2;
int **align_mat1, **align_mat2;
int n_lowgaps, alilen_mat1, alilen_mat2;
char **aseq_mat1, **aseq_mat2;
char **aseqGapTr1, **aseqGapTr2;
int **tracebackDir;
int **flagNewGapQuery, **flagNewGapDb;
int *positive, **col_score;
int posGp, segment_len;

int *apos1, *apos2, *aposref1, *aposref2;
int start_ref1, start_ref2, end_ref1, end_ref2, reflen_nogp;
double coverage1, coverage2, falsecov, accuracy1, accuracy2;

int **alignment1, **alignment2;
double **u_oaf,**h_oaf;
/* One-letter residue alphabet: gap, the 20 amino acids, B, Z, X, '*' and '.',
   followed by the lower-case forms of the letters. */
char *am="-WFYMLIVACGPTSNQDEHRKBZX*.wfymlivacgptsnqdehrkbzx";
/* Three-letter codes; am3[k] is the three-letter form of am[k] for k = 0..25.
   The comma after "***" was missing, which silently merged the last two
   entries into the single string "***..." and left the table one entry short. */
char *am3[]={
"---",
"TRP",
"PHE",
"TYR",
"MET",
"LEU",
"ILE",
"VAL",
"ALA",
"CYS",
"GLY",
"PRO",
"THR",
"SER",
"ASN",
"GLN",
"ASP",
"GLU",
"HIS",
"ARG",
"LYS",
"ASX",
"GLX",
"UNK",
"***",
"...",
};

/* Prototypes. argument() and markSimilarSeqs() are defined later in this
   file; nmlconv() and print_parameters() have no definition in the part
   visible here. */
double *nmlconv(double *conv, conv_info cvf, int wn,int mi);

void argument();
void print_parameters(FILE *outfile,char *argi,char *argo,int nt,char *argt,int argb,char *args,int argm,int argf,int argc, int argw, char *argn,char *arga,double argg, char *argp,char *argd);


int **markSimilarSeqs (char **aseq, int len, int n, int *idthr, int nthr);

/* The remaining file-scope variables are not referenced by the code visible
   in this file. */
double **score_matrix, *score_matrix_srt;
int **newScore_mat;
int **matrix1, **matrix2;

double *ident1, *ident2;
int **count;
int setsize = 1000;
int *maskgaps, *maskgaps1, *maskgaps2, *maskgapRegion, *maskgapRegion1, *maskgapRegion2;
double **pseudoCnt1, **pseudoCnt2;
double *p_comp1, *p_comp2;
double n_c;
double n_eff1, n_eff2;
double **n_effAa1, **n_effAa2; 
double *sum_eff_let1, *sum_eff_let2;

int scoreGivenEnd, score_final;

double b = 1.0;
double f= 32.0;

int **fV_RepeatOpenGapQuery, **fV_RepeatOpenGapDb;
int **fV_DbInClosestNewGapDb, **fV_QInClosestNewGapQuery;

/*
 * Entry point.
 *
 * Command line: -i <input alignment> -o <output file>; other arguments are
 * ignored. The program
 *   1. reads the alignment with readali();
 *   2. drops every column in which the first (query) sequence has '-' or '.';
 *   3. asks markSimilarSeqs() which sequences to purge for idthresh[0];
 *   4. writes the surviving sequences with printali() in blocks of 60 columns.
 *
 * Returns 0 on success. Exits with status 0 after printing the usage text when
 * fewer than two arguments are given, and with status 1 on a malformed command
 * line or an unreadable alignment.
 *
 * Buffers allocated here are intentionally not released: the process ends
 * immediately afterwards.
 */
int main(int argc, char *argv[])
{
	char ARG_I[200] = "", ARG_O[200] = "";
	const int ARG_B = 60;		/* columns per output block */
	int i, j;
	int alilen_nogapQ, nal_nosim;
	size_t max_name_len;
	int **mark_sim;

	/* read input arguments */
	if (argc <= 2) { argument(); exit(0); }
	for (i = 1; i < argc; i++) {
		char *dest;

		if (strcmp(argv[i], "-i") == 0) {
			dest = ARG_I;
		} else if (strcmp(argv[i], "-o") == 0) {
			dest = ARG_O;
		} else {
			continue;
		}
		/* The option must be followed by a value that fits the buffer. */
		if (i + 1 >= argc) {
			fprintf(stderr, "Missing file name after %s\n", argv[i]);
			argument();
			exit(1);
		}
		if (strlen(argv[i + 1]) >= sizeof ARG_I) {
			fprintf(stderr, "File name too long: %s\n", argv[i + 1]);
			exit(1);
		}
		strcpy(dest, argv[i + 1]);
		i++;
	}
	if (ARG_I[0] == '\0' || ARG_O[0] == '\0') {
		argument();
		exit(1);
	}

	/* read alignment and skip the columns with gaps in the first sequence */
	readali(ARG_I);
	if (aseq == NULL || nal <= 0) {
		fprintf(stderr, "aseq file %s not readable\n", ARG_I);
		exit(1);
	}

	aseq1 = cmatrix(0, nal, 0, alilen);
	alilen_nogapQ = 0;
	for (j = 0; j < alilen; j++) {
		if (aseq[0][j] == '-' || aseq[0][j] == '.') {
			continue;
		}
		for (i = 0; i < nal; i++) {
			aseq1[i][alilen_nogapQ] = aseq[i][j];
		}
		alilen_nogapQ++;
	}

	/* Purge one of seqs from pairs with >threshold identity, leave QUERY */
	mark_sim = markSimilarSeqs(aseq1, alilen_nogapQ, nal, idthresh, IDTHRESH_NUM);

	/* Size the name rows by the longest name actually read, so that the
	   copy below cannot overrun a fixed-width row. */
	max_name_len = 0;
	for (i = 0; i < nal; i++) {
		size_t name_len = strlen(aname[i]);
		if (name_len > max_name_len) {
			max_name_len = name_len;
		}
	}

	aseq2 = cmatrix(0, nal, 0, alilen_nogapQ);
	astart2 = ivector(0, nal);
	aname2 = cmatrix(0, nal, 0, (long) max_name_len);

	nal_nosim = 0;
	for (i = 0; i < nal; i++) {
		if (mark_sim[0][i] == 1) {
			continue;
		}
		for (j = 0; j < alilen_nogapQ; j++) {
			aseq2[nal_nosim][j] = aseq1[i][j];
		}
		astart2[nal_nosim] = astart[i];
		strcpy(aname2[nal_nosim], aname[i]);
		nal_nosim++;
	}

	printali(ARG_O, ARG_B, nal_nosim, alilen_nogapQ, aname2, aseq2, astart2);

	fprintf(stderr, "Finished...");

	return 0;
}


	
/*
 * Print the command-line usage text to stderr.
 *
 * The text is written with fputs() rather than fprintf(): the last line ends
 * in a literal '%', which is not a valid conversion specification when passed
 * as a printf-style format.
 */
void argument(void)
{
	fputs("      prep_psiblastali arguments:\n", stderr);
	fputs("\n", stderr);
	fputs("  -i    Input alignment file [File in]\n", stderr);
	fputs("\n", stderr);
	fputs("  -o    Output file with processed alignment, where\n", stderr);
	fputs("        all columns with gaps in top (query) sequence are deleted;\n", stderr);
	fputs("        any sequence identical to QUERY is purged;\n", stderr);
	fputs("        only 1 copy is retained of any sequences that have identity >97%\n", stderr);
}

#define NR_END 1

/*
 * Fatal-error handler used by the allocation helpers: writes error_text and
 * a termination notice to stderr, then ends the process with exit status 1.
 * Does not return.
 */
void nrerror(char error_text[])
{
	fprintf(stderr, "%s\n", error_text);
	fputs("FATAL - execution terminated\n", stderr);
	exit(1);
}


/*
 * Allocate a char vector addressable as v[nl..nh].
 *
 * nh-nl+1+NR_END elements are allocated and the returned pointer is shifted
 * so that index nl refers to element NR_END of the underlying buffer; release
 * it with free_cvector() using the same nl. Calls nrerror() (which terminates
 * the program) when the allocation fails.
 *
 * Fixes: the element size was sizeof(int) instead of sizeof(char), and the
 * failure message named ivector() instead of cvector().
 */
char *cvector(long nl, long nh)
{
	char *v;

	v = (char *) malloc((size_t) ((nh - nl + 1 + NR_END) * sizeof(char)));
	if (!v) nrerror("allocation failure in cvector()");
	return v - nl + NR_END;
}


/*
 * Allocate an int vector addressable as v[nl..nh].
 *
 * The underlying buffer holds nh-nl+1+NR_END ints; the returned pointer is
 * shifted so that index nl maps to element NR_END of that buffer. Calls
 * nrerror() when malloc() fails.
 */
int *ivector(long nl, long nh)
{
	size_t bytes = (size_t) ((nh - nl + 1 + NR_END) * sizeof(int));
	int *base = (int *) malloc(bytes);

	if (base == NULL) {
		nrerror("allocation failure in ivector()");
	}
	return base - nl + NR_END;
}

/**** DUMP IN FAVOR OF NRUTIL.H ****/

/*
 * Allocate a long vector addressable as v[nl..nh].
 *
 * The underlying buffer holds nh-nl+1+NR_END longs; the returned pointer is
 * shifted so that index nl maps to element NR_END of that buffer. Calls
 * nrerror() when malloc() fails.
 */
long *lvector(long nl, long nh)
{
	size_t bytes = (size_t) ((nh - nl + 1 + NR_END) * sizeof(long int));
	long int *base = (long int *) malloc(bytes);

	if (base == NULL) {
		nrerror("allocation failure in lvector()");
	}
	return base - nl + NR_END;
}

/*
 * Allocate a double vector addressable as v[nl..nh].
 *
 * The underlying buffer holds nh-nl+1+NR_END doubles; the returned pointer is
 * shifted so that index nl maps to element NR_END of that buffer. Calls
 * nrerror() when malloc() fails.
 */
double *dvector(long nl, long nh)
{
	size_t bytes = (size_t) ((nh - nl + 1 + NR_END) * sizeof(double));
	double *base = (double *) malloc(bytes);

	if (base == NULL) {
		nrerror("allocation failure in dvector()");
	}
	return base - nl + NR_END;
}

/*
 * Allocate a char matrix addressable as m[nrl..nrh][ncl..nch].
 *
 * Two buffers are allocated: one array of row pointers and one contiguous
 * block holding all rows back to back. Both are offset by NR_END and by the
 * lower index bounds. Calls nrerror() when either malloc() fails.
 */
char **cmatrix(long nrl, long nrh, long ncl, long nch)
{
	const long nrow = nrh - nrl + 1;
	const long ncol = nch - ncl + 1;
	char **rows;
	char *cells;
	long r;

	/* row-pointer array */
	rows = (char **) malloc((size_t) ((nrow + NR_END) * sizeof(char *)));
	if (rows == NULL) {
		nrerror("allocation failure 1 in cmatrix()");
	}
	rows = rows + NR_END - nrl;

	/* storage for all rows, first row pointer set here */
	cells = (char *) malloc((size_t) ((nrow * ncol + NR_END) * sizeof(char)));
	rows[nrl] = cells;
	if (rows[nrl] == NULL) {
		nrerror("allocation failure 2 in cmatrix()");
	}
	rows[nrl] = rows[nrl] + NR_END - ncl;

	/* every further row starts ncol elements after the previous one */
	for (r = nrl; r < nrh; r++) {
		rows[r + 1] = rows[r] + ncol;
	}

	return rows;
}

/*
 * Allocate an int matrix addressable as m[nrl..nrh][ncl..nch].
 *
 * Two buffers are allocated: one array of row pointers and one contiguous
 * block holding all rows back to back. Both are offset by NR_END and by the
 * lower index bounds. Calls nrerror() when either malloc() fails.
 */
int **imatrix(long nrl, long nrh, long ncl, long nch)
{
	const long nrow = nrh - nrl + 1;
	const long ncol = nch - ncl + 1;
	int **rows;
	int *cells;
	long r;

	/* row-pointer array */
	rows = (int **) malloc((size_t) ((nrow + NR_END) * sizeof(int *)));
	if (rows == NULL) {
		nrerror("allocation failure 1 in imatrix()");
	}
	rows = rows + NR_END - nrl;

	/* storage for all rows, first row pointer set here */
	cells = (int *) malloc((size_t) ((nrow * ncol + NR_END) * sizeof(int)));
	rows[nrl] = cells;
	if (rows[nrl] == NULL) {
		nrerror("allocation failure 2 in imatrix()");
	}
	rows[nrl] = rows[nrl] + NR_END - ncl;

	/* every further row starts ncol elements after the previous one */
	for (r = nrl; r < nrh; r++) {
		rows[r + 1] = rows[r] + ncol;
	}

	return rows;
}

/*** TO DO : rewrite it in a usual (cycle) way ***/
/*
 * Allocate a double matrix addressable as m[nrl..nrh][ncl..nch].
 *
 * Two buffers are allocated: one array of row pointers and one contiguous
 * block holding all rows back to back. Both are offset by NR_END and by the
 * lower index bounds. Calls nrerror() when either malloc() fails.
 */
double **dmatrix(long nrl, long nrh, long ncl, long nch)
{
	const long nrow = nrh - nrl + 1;
	const long ncol = nch - ncl + 1;
	double **rows;
	double *cells;
	long r;

	/* row-pointer array */
	rows = (double **) malloc((size_t) ((nrow + NR_END) * sizeof(double *)));
	if (rows == NULL) {
		nrerror("allocation failure 1 in dmatrix()");
	}
	rows = rows + NR_END - nrl;

	/* storage for all rows, first row pointer set here */
	cells = (double *) malloc((size_t) ((nrow * ncol + NR_END) * sizeof(double)));
	rows[nrl] = cells;
	if (rows[nrl] == NULL) {
		nrerror("allocation failure 2 in dmatrix()");
	}
	rows[nrl] = rows[nrl] + NR_END - ncl;

	/* every further row starts ncol elements after the previous one */
	for (r = nrl; r < nrh; r++) {
		rows[r + 1] = rows[r] + ncol;
	}

	return rows;
}


void free_ivector(int *v, long nl, long nh)
/* free an int vector allocated with ivector() */
{
	free((FREE_ARG) (v+nl-NR_END));
}

void free_cvector(char *v, long nl, long nh)
/* free an unsigned char vector allocated with cvector() */
{
	free((FREE_ARG) (v+nl-NR_END));
}

void free_dvector(double *v, long nl, long nh)
/* free a double vector allocated with dvector() */
{
	free((FREE_ARG) (v+nl-NR_END));
}



void free_dmatrix(double **m, long nrl, long nrh, long ncl, long nch)
/* free a double matrix allocated by dmatrix() */
{
	free((FREE_ARG) (m[nrl]+ncl-NR_END));
	free((FREE_ARG) (m+nrl-NR_END));
}

void free_imatrix(int **m, long nrl, long nrh, long ncl, long nch)
/* free an int matrix allocated by imatrix() */
{
	free((FREE_ARG) (m[nrl]+ncl-NR_END));
	free((FREE_ARG) (m+nrl-NR_END));
}

void free_cmatrix(char **m, long nrl, long nrh, long ncl, long nch)
/* free an int matrix allocated by cmatrix() */
{
	free((FREE_ARG) (m[nrl]+ncl-NR_END));
	free((FREE_ARG) (m+nrl-NR_END));
}





/*
 * Convert a one-letter amino-acid code to its integer code.
 *
 * The 20 letters W F Y M L I V A C G P T S N Q D E H R K map to 1..20 in that
 * order, in either case. Every other value (gaps, B/Z/X, '*', '\0', anything
 * outside 1..127) maps to 0.
 *
 * The parameter now has an explicit type, matching the prototype
 * "int am2num(int c)"; the old definition relied on implicit int, which is
 * not valid since C99.
 */
int am2num(int c)
{
	static const char upper[] = "WFYMLIVACGPTSNQDEHRK";
	static const char lower[] = "wfymlivacgptsnqdehrk";
	const char *hit;

	/* strchr() would match the terminating '\0' and truncates its argument
	   to char, so reject everything that cannot be a letter up front. */
	if (c <= 0 || c > 127) {
		return 0;
	}
	if ((hit = strchr(upper, c)) != NULL) {
		return (int) (hit - upper) + 1;
	}
	if ((hit = strchr(lower, c)) != NULL) {
		return (int) (hit - lower) + 1;
	}
	return 0;
}


/*
 * Convert a one-letter residue code to its integer code, including the
 * ambiguity codes.
 *
 * W F Y M L I V A C G P T S N Q D E H R K map to 1..20, B to 21, Z to 22 and
 * X to 23 (letters in either case); '*' maps to 24. Every other value maps
 * to 0.
 *
 * The parameter now has an explicit type, matching the prototype
 * "int am2numBZX(int c)"; the old definition relied on implicit int, which is
 * not valid since C99.
 */
int am2numBZX(int c)
{
	static const char upper[] = "WFYMLIVACGPTSNQDEHRKBZX*";
	static const char lower[] = "wfymlivacgptsnqdehrkbzx";
	const char *hit;

	/* strchr() would match the terminating '\0' and truncates its argument
	   to char, so reject everything that cannot be a valid code up front. */
	if (c <= 0 || c > 127) {
		return 0;
	}
	if ((hit = strchr(upper, c)) != NULL) {
		return (int) (hit - upper) + 1;
	}
	if ((hit = strchr(lower, c)) != NULL) {
		return (int) (hit - lower) + 1;
	}
	return 0;
}

/* Line buffer filled by fgets() in readali(). */
static char str[MAXSTR+1];

/* Alignment storage filled by readali(). These repeat declarations that
   already appear earlier in the file. */
char **aname, **aseq;
int nal, alilen, *astart, *alen;
/* Not referenced by the code visible in this file. */
int **alignment;



/* The prototypes below also repeat earlier declarations. */
static void *mymalloc(int size);
char *strsave(char *str);
char *strnsave(char *str, int l);
static char **incbuf(int n, char **was);
static int *incibuf(int n, int *was);

void readali(char *filename);
int **ali_char2int(char **aseq,int start_num, int start_seq);
int **read_alignment2int(char *filename,int start_num,int start_seq);

/* No definition of the following four functions is visible in this file. */
void counter(int b);
double effective_number(int **ali, int *marks, int n, int start, int end);
double effective_number_nogaps(int **ali, int *marks, int n, int start, int end);
double effective_number_nogaps_expos(int **ali, int *marks, int n, int start, int end, int pos);



/*
 * malloc() wrapper that never returns NULL: on failure it reports the
 * requested size on stderr and terminates the program with exit status 1.
 */
static void *mymalloc(int size)
{
	void *block = malloc(size);

	if (block != NULL) {
		return block;
	}
	fprintf(stderr, "Not enough memory: %d\n", size);
	exit(1);
}

/*
 * Return a newly allocated copy of the NUL-terminated string str.
 * The copy comes from mymalloc(), so the call terminates the program instead
 * of returning NULL; the caller owns the result and releases it with free().
 */
char *strsave(char *str)
{
	int length = strlen(str);
	char *copy = mymalloc(length + 1);

	return strcpy(copy, str);
}

/*
 * Return a newly allocated, NUL-terminated copy of the first l bytes of str.
 * Exactly l bytes are copied regardless of any NUL inside them. The buffer
 * comes from mymalloc(); the caller releases it with free().
 */
char *strnsave(char *str, int l)
{
	char *copy = mymalloc(l + 1);

	copy[l] = '\0';
	memcpy(copy, str, l);
	return copy;
}

/*
 * Grow an array of n string pointers to n+1 entries.
 *
 * A new array is allocated with mymalloc(); when n > 0 the old entries are
 * copied over and the old array is freed. When n == 0 the argument was is
 * neither read nor freed. The new last entry is set to NULL.
 */
static char **incbuf(int n, char **was)
{
	char **grown = mymalloc((n + 1) * sizeof(grown[0]));
	int idx;

	for (idx = 0; idx < n; idx++) {
		grown[idx] = was[idx];
	}
	if (n > 0) {
		free(was);
	}
	grown[n] = NULL;
	return grown;
}

/*
 * Grow an array of n ints to n+1 entries.
 *
 * A new array is allocated with mymalloc(); when n > 0 the old entries are
 * copied over and the old array is freed. When n == 0 the argument was is
 * neither read nor freed. The new last entry is set to 0.
 */
static int *incibuf(int n, int *was)
{
	int *grown = mymalloc((n + 1) * sizeof(grown[0]));
	int idx;

	for (idx = 0; idx < n; idx++) {
		grown[idx] = was[idx];
	}
	if (n > 0) {
		free(was);
	}
	grown[n] = 0;
	return grown;
}
/*
 * Report on stderr which readali() check failed; err_num is the number
 * passed by the failing check. Does not terminate the program itself.
 */
void err_readali(int err_num)
{
	FILE *sink = stderr;

	fprintf(sink, "Error with reading alignment: %d\n", err_num);
}

/*
 * Read an alignment file into the file-scope variables aname, aseq, astart,
 * alen, nal and alilen.
 *
 * Expected layout: an optional first line starting with "CLUSTAL W", then
 * blocks of lines separated by blank lines. Each line of a block is
 *     <name> [<start number>] <sequence chunk> ...
 * and every block must list the same names in the same order with chunks of
 * equal length. Lines whose first non-blank character lies at or beyond the
 * column where the first sequence data started are skipped.
 *
 * Results: nal = number of sequences, alilen = total number of alignment
 * columns, aseq[i] = concatenated chunks of sequence i (NUL-terminated),
 * aname[i] = its name, astart[i] = start number from the first block (0 when
 * absent), alen[i] = number of alphabetic characters in sequence i.
 *
 * Any inconsistency is reported through err_readali() and terminates the
 * program with exit status 1.
 *
 * Fixes relative to the previous version:
 *  - ii was read before it had been assigned a value;
 *  - a name that ran up to the end of the buffer made the parser step past
 *    the terminating NUL;
 *  - <ctype.h> classifiers now receive unsigned char values;
 *  - an unreachable "No alignments read" branch was removed.
 */
void readali(char *filename)
{
	FILE *fp;
	char *s, *ss, *seqbuf;
	int n, l, len, len0 = 0;
	int ii = 0, mark = 1;	/* ii: column of the first data field; mark: ii not yet known */

	if ((fp = fopen(filename, "r")) == NULL) {
		fprintf(stderr, "No such file: \"%s\"\n", filename);
		err_readali(1);
		exit(1);
	}

	alilen = 0;
	nal = 0;
	n = 0;
	/* Skip the header line if there is one, otherwise start over. */
	if (fgets(str, MAXSTR, fp) != NULL) {
		if (strncmp(str, "CLUSTAL W", 9) != 0) { rewind(fp); }
	}

	while (fgets(str, MAXSTR, fp) != NULL) {
		for (ss = str; isspace((unsigned char) *ss); ss++) ;
		/* Text starting in the sequence columns is not a sequence line. */
		if ((ii <= ss - str) && (mark == 0)) { continue; }
		if (*ss == '\0') {
			/* Blank line: end of a block (if one was being read). */
			if (n == 0) {
				continue;
			}
			if (nal == 0) {
				nal = n;
			} else if (n != nal) {
				fprintf(stderr, "Wrong nal, was: %d, now: %d\n", nal, n);
				err_readali(3); exit(1);
			}
			n = 0;
			continue;
		}

		/* Isolate the name token. */
		for (s = ss; *s != '\0' && !isspace((unsigned char) *s); s++) ;
		if (*s != '\0') {
			*s++ = '\0';
		}
		if (nal == 0) {
			/* First block: grow the tables by one sequence. */
			astart = incibuf(n, astart);
			alen = incibuf(n, alen);
			aseq = incbuf(n, aseq);
			aname = incbuf(n, aname);
			aname[n] = strsave(ss);
		} else {
			if (n < 0 || n >= nal) {
				fprintf(stderr, "Bad sequence number: %d of %d\n", n, nal);
				err_readali(4);  exit(1);
			}
			if (strcmp(ss, aname[n]) != 0) {
				fprintf(stderr, "Names do not match");
				fprintf(stderr, ", was: %s, now: %s\n", aname[n], ss);
				err_readali(5);  exit(1);
			}
		}
		for (ss = s; isspace((unsigned char) *ss); ss++);
		if (mark == 1) {
			/* Remember where the data fields start on the first sequence line. */
			ii = ss - str;
			mark = 0;
		}

		/* Optional start number: a run of digits followed by white space. */
		for (s = ss; isdigit((unsigned char) *s); s++) ;
		if (isspace((unsigned char) *s)) {
			if (nal == 0) {
				astart[n] = atoi(ss);
			}
			for (ss = s; isspace((unsigned char) *ss); ss++);
		}

		/* Measure the chunk: l counts letters, len counts letters and gaps. */
		for (s = ss, len = 0, l = 0; *s != '\0' && !isspace((unsigned char) *s); s++) {
			if (isalpha((unsigned char) *s)) {
				l++;
			}

/*** Calculate len -- the full number of aa and gaps, excluding position numbers in the end ***/

			if (isalpha((unsigned char) *s) || *s == '-' || *s == '.') {
				len++;
			}
		}

		if (n == 0) {
			len0 = len;
			alilen += len;
		} else if (len != len0) {
			fprintf(stderr, "wrong len for %s", aname[n]);
			fprintf(stderr, ", was: %d, now: %d\n", len0, len);
			err_readali(6); exit(1);
		}
		alen[n] += l;
		if (aseq[n] == NULL) {
			aseq[n] = strnsave(ss, len);
		} else {
			/* Append this chunk to what was read in earlier blocks. */
			seqbuf = mymalloc(alilen + 1);
			memcpy(seqbuf, aseq[n], alilen - len);
			free(aseq[n]);
			aseq[n] = seqbuf;
			memcpy(seqbuf + alilen - len, ss, len);
			seqbuf[alilen] = '\0';
		}
		n++;
	}
	if (nal == 0) {
		if (n == 0) {
			fprintf(stderr, "No alignments read\n");
			err_readali(7);  exit(1);
		}
		nal = n;
	} else if (n != 0 && n != nal) {
		fprintf(stderr, "Wrong nal, was: %d, now: %d\n", nal, n);
		err_readali(8);  exit(1);
	}
	fclose(fp);
}


/*
 * In an alignment, mark the sequences that have a very similar "double"
 * (% identity > threshold) and therefore have to be purged.
 *
 * aseq  : n sequences of len columns each
 * idthr : nthr identity thresholds in percent; the array must be sorted from
 *         min to max
 * Returns an int matrix allocated with imatrix(0,nthr,0,n); mark[t][j] == 1
 * means sequence j is to be purged for threshold idthr[t], 0 means keep.
 * The caller owns the matrix. Sequence 0 is never marked.
 *
 * For every pair i < j the number of differing columns is counted; j is
 * marked for each threshold t with ndif < 0.01*(100-idthr[t])*len. Counting
 * stops as soon as the pair is too different even for idthr[0].
 *
 * Since PSI-BLAST-generated alignments are considered, the same amino-acid
 * alphabet over the whole alignment is implied (e.g. capital letters for
 * residues and '-' for gaps): characters are compared verbatim.
 *
 * NOTE(review): a sequence i that has itself been marked still serves as a
 * reference for later sequences j -- confirm that this is intended.
 *
 * Fixes: the per-threshold limits are now computed on the fly instead of in
 * a dvector() that was never freed, and nthr <= 0 no longer indexes
 * mark[nthr-1].
 */
int **markSimilarSeqs (char **aseq, int len, int n, int *idthr, int nthr)
{
	int i, j, k, l;
	int ndif, too_different;
	double loosest, limit;
	int **mark;

	mark = imatrix(0,nthr,0,n);
	for (l = 0; l < nthr; l++) {
		for (j = 0; j < n; j++) { mark[l][j] = 0; }
	}
	if (nthr <= 0) {
		return mark;
	}

	/* Largest tolerated number of differences (lowest identity threshold). */
	loosest = 0.01*(100-idthr[0])*len;

	for (i = 0; i < n; i++) {
		for (j = i+1; j < n; j++) {
			/* Already marked for the strictest threshold: nothing to add. */
			if (mark[nthr-1][j] == 1) { continue; }

			too_different = 0;
			ndif = 0;
			for (k = 0; k < len; k++) {
				if (aseq[i][k] != aseq[j][k]) {
					ndif++;
					if (ndif >= loosest) { too_different = 1; break; }
				}
			}
			if (too_different) { continue; }

			for (l = 0; l < nthr; l++) {
				limit = 0.01*(100-idthr[l])*len;
				if (ndif < limit) { mark[l][j] = 1; }
			}
		}
	}

	return mark;
}



/*** Print ali to file ****/
/*
 * Write an alignment to the file argo in blocks of chunk columns.
 *
 * argo    : output file name (created or truncated)
 * chunk   : number of alignment columns per block; a value <= 0 means
 *           "everything in one block"
 * n       : number of sequences
 * len     : number of alignment columns
 * aname   : n NUL-terminated sequence names
 * aseqGap : n sequences of at least len characters (need not be terminated)
 * start   : n start numbers, printed in the first block only
 *
 * Layout of each line: the name left-justified in a field three characters
 * wider than the longest name, then a 7-character field holding the start
 * number (first block) or blanks (later blocks), then the residues. The file
 * begins with an empty line and every block is followed by one.
 *
 * Terminates the program with exit status 1 if the file cannot be opened.
 *
 * Fixes relative to the previous version:
 *  - sequence names were used as the fprintf() format string;
 *  - the file name and each name were copied into fixed 100-byte buffers;
 *  - the fopen() result was not checked;
 *  - the width of the start number was derived from log()/log(10), which is
 *    off by one for values such as 1000 and undefined for negative values;
 *  - n == 0 dereferenced aname[0], and chunk <= 0 looped forever.
 */
static void printali(char *argo, int chunk, int n, int len, char **aname, char **aseqGap, int *start)
{
	FILE *fpp;
	int i, jj, block, name_width;

	fpp = fopen(argo, "w");
	if (fpp == NULL) {
		fprintf(stderr, "Cannot open output file: \"%s\"\n", argo);
		exit(1);
	}

	/* Name column: longest name plus three blanks. */
	name_width = 0;
	for (i = 0; i < n; i++) {
		int name_len = (int) strlen(aname[i]);
		if (name_len > name_width) {
			name_width = name_len;
		}
	}
	name_width += 3;

	if (chunk <= 0) {
		chunk = len;
	}

	jj = 0;
	do {
		if (jj == 0) {
			fprintf(fpp, "\n");
		}

		/* Number of columns in this block. */
		block = len - jj;
		if (block > chunk) {
			block = chunk;
		}
		if (block < 0) {
			block = 0;
		}

		for (i = 0; i < n; i++) {
			fprintf(fpp, "%-*s", name_width, aname[i]);
			if (jj == 0) {
				fprintf(fpp, "%-7d", start[i]);
			} else {
				fprintf(fpp, "%7s", "");
			}
			fwrite(aseqGap[i] + jj, 1, (size_t) block, fpp);
			fputc('\n', fpp);
		}

		fputc('\n', fpp);

		jj += chunk;

	} while (jj < len);

	if (fclose(fpp) != 0) {
		fprintf(stderr, "Error while writing output file: \"%s\"\n", argo);
	}
}

/*
 * Convert a character alignment to integer codes.
 *
 * Allocates (with imatrix) and fills
 *     ali[start_num .. start_num+nal-1][start_seq .. start_seq+alilen-1]
 * from aseq[0..nal-1][0..alilen-1], translating every character with
 * am2num(). nal and alilen are the file-scope values set by readali().
 * The caller owns the returned matrix.
 */
int **ali_char2int(char **aseq, int start_num, int start_seq)
{
	const int last_row = start_num + nal - 1;
	const int last_col = start_seq + alilen - 1;
	int **ali = imatrix(start_num, last_row, start_seq, last_col);
	int row, col;

	for (row = 0; row < nal; row++) {
		for (col = 0; col < alilen; col++) {
			ali[start_num + row][start_seq + col] = am2num(aseq[row][col]);
		}
	}
	return ali;
}

/*
 * Read the alignment in filename with readali() and return it converted to
 * integer codes by ali_char2int(), using start_num and start_seq as the
 * lower index bounds of the result.
 */
int **read_alignment2int(char *filename, int start_num, int start_seq)
{
	readali(filename);
	return ali_char2int(aseq, start_num, start_seq);
}

