/***************************************
 * copyright (c) Vanden Berghen Frank  *
 * V 1.2                               *
 * *************************************/

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <string.h>
#include <math.h>
#include "definitions.h"
#include "textUtils.h"
#include "tools.h"

#define __inside_datasetcpp__
#include "DataSet.h"
#undef __inside_datasetcpp__

#ifdef __NO_DATASET__
#define __EVAL_ONLY__
#else

#define LINELENGTH 10000
//#include <crtdbg.h>
// #define ClassTest(Case)	 (*((ClassNo*)(ItemTest.d[Case]+MaxAtt)))

// Default constructor: an empty data set with no rows and no owned row storage.
// OldCrossValTot starts at -1.0, a value that no accepted CrossValTot (> 1)
// can be equal to, so the first valid equilibrate() call always does the work.
DataSet::DataSet()
    : Item(NULL),
      nItem(0),
      iw(0),
      MaxAtt(0),
      MaxClass(0),
      MinClass(0),
      OldCrossValTot(-1.0),
      StartOfItemSpace1(NULL)
{
}

#define RealClass(Case) (*((ClassNo*)(Item[Case]+MaxAtt)))
void DataSet::save(FILE *f)
{
    double **Item=this->Item;
    int ni=nItem, MaxAtt=this->MaxAtt,i,j;
    for (i=0; i<ni; i++)
    {
        fprintf(f,"id ");
        for (j=0; j<MaxAtt; j++) fprintf(f,"%e ",Item[i][j]);
        fprintf(f,"%i\n", RealClass(i)+MinClass);
    }
    fflush(f);
}

// Returns a pointer to the first blank (' ') of s, or to the terminating '\0'
// when s holds no blank. Used to step over the leading (id) column of a line
// without ever running past the end of the buffer.
static char *skipFirstColumn(char *s)
{
    while ((*s!='\0')&&(*s!=' ')) s++;
    return s;
}

// Reports an allocation failure while loading a data file and terminates.
static void dataSetOutOfMemory()
{
    fprintf(stderr,"out of memory.\n");
    exit(252);
}

// Loads a data set from the text file `filename`.
//
// Every non-empty line is one row: a first column that is skipped (everything
// up to the first blank), then the attributes, then the class number, which
// must lie in [0,255]. The column count (attributes + class) is the value
// lire_Number_of_Attributes() returns for the first non-empty line.
//
// Parameters:
//   filename  path of the file to read.
//   w         number of extra double slots reserved in each row right after
//             the class slot (stored in iw; presumably per-row weights --
//             they are left uninitialised here).
//
// Row layout in memory (doubles): [attributes (MaxAtt)] [class] [w extra slots].
// After loading, MaxAtt is the number of attributes (class excluded), classes
// are shifted so that the smallest one is 0, MinClass keeps the shift and
// MaxClass is the number of class values (max-min+1).
//
// Terminates the process on error: exit(255) file cannot be opened,
// exit(253) no usable first line / no columns, exit(254) class out of range,
// exit(252) out of memory.
DataSet::DataSet(char *filename, int w): nItem(0), iw(w), OldCrossValTot(-1)
{
    FILE *stream;
    char line[LINELENGTH],*tline;
    int k,ItemSpace=50,found=0;
    double *g,*h,*grown,d;
    ClassNo b;

    MaxClass=0; MinClass=255;
    if ((stream=fopen(filename,"r"))==NULL)
    {
        fprintf(stderr,"data file not found.\n");
        exit(255);
    }

    // find the first non-empty line; it fixes the number of columns and is
    // also the first data row
    while (fgets(line,LINELENGTH,stream)!=NULL)
    {
        if (!emptyline(line)) { found=1; break; }
    }
    MaxAtt=0;
    if (found) MaxAtt=lire_Number_of_Attributes(skipFirstColumn(line));
    if (MaxAtt<=0)
    {
        fprintf(stderr,"wrong data file.\n"); exit(253);
    }

    Verbosity(1)
        printf("file name '%s' is open.\nnumber of columns=%i\n",filename,MaxAtt);

    // one row = (MaxAtt-1) attributes + 1 class slot + w extra slots
    g=h=(double*)malloc(ItemSpace*(MaxAtt+w)*sizeof(double));
    if (h==NULL) dataSetOutOfMemory();

    do
    {
        if (emptyline(line)) continue;
        if (nItem==ItemSpace)
        {
            // grow by 50 rows; keep the old block reachable until realloc succeeds
            ItemSpace+=50;
            grown=(double*)realloc(h,ItemSpace*(MaxAtt+w)*sizeof(double));
            if (grown==NULL) dataSetOutOfMemory();
            h=grown;
            g=h+nItem*(MaxAtt+w);
        }
        tline=skipFirstColumn(line);
        for (k=0; k<MaxAtt-1; k++) { *g=lire_double(&tline); g++; }
        d=lire_double(&tline);
        // written so that a NaN is rejected as well
        if (!((d>=0)&&(d<=255)))
        {
            fprintf(stderr,"wrong class number.\n"); exit(254);
        }
        b=(ClassNo)d;
        // the class goes in the slot right after the attributes; the w extra
        // slots that follow are skipped
        *((ClassNo*)g)=b; g+=w+1; nItem++;
        MaxClass=MAX(b,MaxClass);
        MinClass=MIN(b,MinClass);
    } while (fgets(line,LINELENGTH,stream)!=NULL);

    // give back the unused tail; if shrinking fails the larger block is kept
    if (nItem!=ItemSpace)
    {
        grown=(double*)realloc(h,nItem*(MaxAtt+w)*sizeof(double));
        if (grown!=NULL) h=grown;
    }

    fclose(stream);

    Item=(double**)malloc(nItem*sizeof(double*));
    if (Item==NULL) dataSetOutOfMemory();
    StartOfItemSpace1=h;
    MaxAtt--;                 // from "number of columns" to "number of attributes"
    MaxClass+=1-MinClass;     // from "largest class" to "number of class values"

    for (k=0; k<nItem; k++)
    {
        Item[k]=h;
        // Shift the class so that the smallest class becomes 0. The class
        // lives at offset MaxAtt (where it was written above and where it is
        // read back everywhere else), not after the w extra slots.
        *((ClassNo*)(h+MaxAtt))-=MinClass;
        h+=MaxAtt+w+1;
    }

/*
	if (MinClass!=0)
	{
		fprintf(stderr,"Attention: MinClass=%i\n"
				"The outputs of the trees may not match intput data file\n"
				"(a shift in the value is simply introduced).\n",
				MinClass);
	};
*/
//	_CrtCheckMemory( );
}

// Destructor: releases the memory held by this data set.
// StartOfItemSpace1 is the contiguous row storage; it is NULL for sets built
// by the default constructor, in which case only the row-pointer array goes.
DataSet::~DataSet()
{
    // free(NULL) is a no-op, so the storage pointer needs no explicit test
    free(StartOfItemSpace1);
    free(Item);
}


// Generates ItemCreate (the data used to build the trees) and ItemTest
// (the data used to validate the trees).
// Both sets are created from the global data set Orig.
// ItemTest contains 1/CrossValTot of the starting data set (which is Orig).
// It is the P-th part of Orig that goes into ItemTest.
// ItemCreate is Orig minus ItemTest.

// Reorders the row-pointer array Item so that it is stratified by class for a
// cross-validation in CrossValTot parts.
//
// The new order is built chunk by chunk. Each chunk is given
// (int)(totOccurence[k]/CrossValTot) rows of every class k, plus a share of
// the rounding remainder `diff` handed out one row at a time, round-robin over
// the classes. Rows of one class keep their original relative order. Rows not
// placed in a chunk are appended at the end, class by class.
//
// Only the pointer array is replaced (the old one is freed); the row data
// itself is not moved or copied.
//
// The last CrossValTot is remembered in OldCrossValTot: a call with the same
// value returns immediately. A CrossValTot <= 1 prints an error and calls
// exit(255); note that OldCrossValTot has already been overwritten by then.
void DataSet::equilibrate(double CrossValTot)
{
//    return;
    // same value as last time: Item is already in the required order
    if (CrossValTot==OldCrossValTot) return;
    OldCrossValTot=CrossValTot;
    if (CrossValTot<=1) 
    {
		fprintf(stderr,"error: bad value %f for CrossValTot\n",CrossValTot);
		exit(255);
    }
    // calculate distribution:
    // One allocation carved into five int arrays of MaxClass entries each:
    //   distribution  rows of each class every chunk gets (rounded down)
    //   localdist     working copy of distribution for the current chunk
    //   pItem         per class, index in Item where the next search resumes
    //   totOccurence  total number of rows of each class
    //   occurence     rows of each class already assigned to earlier chunks
    int *distribution=(int*)malloc(5*MaxClass*sizeof(int)), 
        *localdist=distribution+MaxClass,
        *pItem=localdist+MaxClass,
        *totOccurence=pItem+MaxClass,
        *occurence=totOccurence+MaxClass,
        i,j,k,l,diff=0, ldiff, current=0;
    memset(totOccurence,0,MaxClass*sizeof(int));
    memset(occurence,0,MaxClass*sizeof(int));
    memset(pItem,0,MaxClass*sizeof(int));
    // count the rows of each class
    for (k=0; k<nItem; k++) totOccurence[RealClass(k)]++;
    for (k=0; k<MaxClass; k++) 
    {
        distribution[k]=(int)(totOccurence[k]/CrossValTot); 
        diff+=distribution[k];
    }
    // rows per chunk that the per-class rounding down left unassigned
    diff=(int)(nItem/CrossValTot)-diff;
    
    // item2 receives the row pointers in their new order; current is the fill index
    double **item2=(double**)malloc(nItem*sizeof(double*));
    // j is the round-robin class cursor; it is NOT reset between chunks
    j=0;
    // build every chunk except the last one
    for (i=0; i<CrossValTot-1; i++)
    {
        memcpy(localdist,distribution,MaxClass*sizeof(int));
        ldiff=diff;
        // hand out the remainder, one extra row per class that still has
        // unassigned rows beyond its share for this chunk
        while (ldiff>0) 
        { 
            if (occurence[j]+localdist[j]<totOccurence[j]) { localdist[j]++; ldiff--; }
            j++; if (j==MaxClass) j=0;
        }

        for (k=0; k<MaxClass; k++) 
        {
            occurence[k]+=localdist[k];
            // resume scanning Item where the previous chunk stopped for class k
            l=pItem[k];
            while (localdist[k]>0) 
            {
                if (RealClass(l)==k)
                {
                    item2[current]=Item[l];
                    current++;
                    localdist[k]--;
                }
                l++;
                // ran out of rows before the quota for class k was filled
                if (l==nItem) break;
            }
            pItem[k]=l;
        }
    }
    // everything not yet placed goes at the end, grouped by class
    for (k=0; k<MaxClass; k++)
    {
        l=pItem[k];
        while (l<nItem) 
        {
            if (RealClass(l)==k)
            {
                item2[current]=Item[l]; current++; 
            }
            l++; 
        };
    }
    // swap in the reordered pointer array
    free(Item);
    Item=item2;
    free(distribution);
}

// Builds the "create" (training) set of cross-validation part p: every row of
// this data set except the rows of part p.
//
// The data set is first equilibrated for CrossValTot. Part p is the slice
// [p*N, (p+1)*N) of Item, with N=(int)(nItem/CrossValTot).
//
// Parameters:
//   p            index of the part to leave out (0-based).
//   CrossValTot  number of parts; must be > 1 (checked by equilibrate()).
//
// Returns a new DataSet allocated with `new`, to be deleted by the caller.
// It owns only its row-pointer array: the rows themselves stay owned by this
// data set (StartOfItemSpace1 is NULL in the result), so the result must not
// outlive it.
//
// A p for which the slice does not fit inside Item prints an error and calls
// exit(255) instead of copying a negative number of bytes.
DataSet *DataSet::generate_Create_Set(int p, double CrossValTot)
{
    equilibrate(CrossValTot);
    const int NumberItemTest=(int)(nItem/CrossValTot);

    if ((p<0)||((p+1)*NumberItemTest>nItem))
    {
        fprintf(stderr,"error: bad part number %i for CrossValTot %f\n",p,CrossValTot);
        exit(255);
    }

    DataSet *ItemCreate=new DataSet();
    ItemCreate->iw=iw;
    ItemCreate->MaxAtt=MaxAtt;
    ItemCreate->MaxClass=MaxClass;
    ItemCreate->MinClass=MinClass;
    ItemCreate->StartOfItemSpace1=NULL;
    ItemCreate->nItem=nItem-NumberItemTest;
    ItemCreate->Item=(double**)malloc(ItemCreate->nItem*sizeof(double*));

    // rows before part p, then rows after it
    const int before=p*NumberItemTest;
    const int after=nItem-(p+1)*NumberItemTest;
    if (before>0)
        memcpy(ItemCreate->Item,Item,before*sizeof(double*));
    if (after>0)
        memcpy(ItemCreate->Item+before,
               Item+(p+1)*NumberItemTest,
               after*sizeof(double*));
    return ItemCreate;
}

// Builds the "test" (validation) set of cross-validation part p: the rows of
// the slice [p*N, (p+1)*N) of Item, with N=(int)(nItem/CrossValTot), taken
// after the data set has been equilibrated for CrossValTot.
//
// Parameters:
//   p            index of the part to extract (0-based).
//   CrossValTot  number of parts; must be > 1 (checked by equilibrate()).
//
// Returns a new DataSet allocated with `new`, to be deleted by the caller.
// It owns only its row-pointer array; the rows stay owned by this data set,
// so the result must not outlive it.
//
// A p for which the slice does not fit inside Item prints an error and calls
// exit(255) instead of reading past the end of Item.
//
// NOTE(review): unlike generate_Create_Set / generate_Bootstrap /
// generate_Binary_Set, iw is not copied here, so the test set keeps the iw=0
// of the default constructor -- confirm this is intended.
DataSet *DataSet::generate_Test_Set(int p, double CrossValTot)
{
    equilibrate(CrossValTot);
    const int NumberItemTest=(int)(nItem/CrossValTot);

    if ((p<0)||((p+1)*NumberItemTest>nItem))
    {
        fprintf(stderr,"error: bad part number %i for CrossValTot %f\n",p,CrossValTot);
        exit(255);
    }

    DataSet *ItemTest=new DataSet();
    ItemTest->MaxAtt=MaxAtt;
    ItemTest->MaxClass=MaxClass;
    ItemTest->MinClass=MinClass;
    ItemTest->nItem=NumberItemTest;
    ItemTest->Item=(double**)malloc(ItemTest->nItem*sizeof(double*));

    if (NumberItemTest>0)
        memcpy(ItemTest->Item,
               Item+p*NumberItemTest,
               NumberItemTest*sizeof(double*));
    return ItemTest;
}

// Builds a bootstrap sample: a data set of the same size as this one whose
// rows are drawn from this one with replacement, one rand1() call per row.
//
// Returns a new DataSet allocated with `new`, to be deleted by the caller.
// It owns only its row-pointer array; the rows stay owned by this data set,
// so the result must not outlive it.
DataSet *DataSet::generate_Bootstrap()
{
    DataSet *bs=new DataSet();
    int i;

    bs->iw=iw;
    bs->MaxAtt=MaxAtt;
    bs->MaxClass=MaxClass;
    bs->MinClass=MinClass;
    bs->nItem=nItem;
    bs->Item=(double**)malloc(bs->nItem*sizeof(double*));

    for (i=0; i<bs->nItem; i++)
    {
        int pick=(int)(rand1()*nItem);
        // NOTE(review): rand1() is defined elsewhere; if it can return
        // exactly 1.0 the index would be nItem, one past the last row.
        if (pick>=nItem) pick=nItem-1;
        bs->Item[i]=Item[pick];
    }

    return bs;
}

// Builds the subset of rows whose (shifted) class is c1 or c2, in their
// current order.
//
// Parameters:
//   c1, c2  the two class numbers to keep, compared against the stored
//           (MinClass-shifted) class of each row.
//
// Returns a new DataSet allocated with `new`, to be deleted by the caller.
// MaxAtt, MaxClass, MinClass and iw are copied unchanged from this data set.
// The result owns only its row-pointer array (NULL when no row matches); the
// rows stay owned by this data set, so the result must not outlive it.
DataSet *DataSet::generate_Binary_Set(ClassNo c1, ClassNo c2)
{
    ClassNo c;
    DataSet *bs=new DataSet();
    int i,j=0,MaxAtt=this->MaxAtt;
    double **item=this->Item;

    bs->iw=iw;
    bs->MaxAtt=MaxAtt;
    bs->MaxClass=MaxClass;
    bs->MinClass=MinClass;
    // worst case: every row matches
    bs->Item=(double**)malloc(nItem*sizeof(double*));

    for (i=0; i<nItem; i++)
    {
        c=(*((ClassNo*)(item[i]+MaxAtt)));
        if ((c==c1)||(c==c2))
        {
            bs->Item[j]=item[i];
            j++;
        }
    }
    bs->nItem=j;

    if (j==0)
    {
        // realloc(ptr,0) is implementation-defined: release explicitly
        free(bs->Item);
        bs->Item=NULL;
    }
    else
    {
        // trim to the rows actually kept; on failure keep the larger array
        double **trimmed=(double**)realloc(bs->Item,j*sizeof(double*));
        if (trimmed!=NULL) bs->Item=trimmed;
    }
    return bs;
}

#endif

#ifndef __EVAL_ONLY__

// Creates an empty binary data set sized for the data set D.
//   nAtt     = MaxClass*(MaxClass-1)/2, the number of unordered class pairs.
//   nExample = capacity in rows: the smaller of D's row count and
//              2^nAtt * MaxClass.
// Storage is then allocated by initMemory(); nItem starts at 0.
BinaryDataSet::BinaryDataSet(DataSet *D): nItem(0)
{
    nAtt=D->MaxClass*(D->MaxClass-1)/2;

    const double rowsInD=(double)D->nItem;
    const double rowLimit=pow(2,nAtt)*D->MaxClass;
    nExample=(int)mmin(rowsInD, rowLimit);

    initMemory();
}

void BinaryDataSet::save(FILE *f)
{
    static const char HEX[]={'0','1','2','3','4','5','6','7','8','9','A','B','C','D','E','F'};
    int *oo=o, i,j;
    unsigned char *dd=d, *cc=c, *tline;
    fprintf(f,"%i\n%i\n", nAtt, nItem);
    tline=(unsigned char*)malloc(lineSize*2);
    for (i=0; i<nItem; i++)
    {
        for (j=0; j<lineSize; j++)
        {
            tline[j<<1]=HEX[(*dd)>>4];
            tline[(j<<1)+1]=HEX[(*dd)&15];
            dd++;
        }
        fwrite(tline,lineSize*2,1,f);
        fprintf(f," %i %03i\n", *(oo++), *(cc++));
    }
    free(tline);
}

void BinaryDataSet::add(unsigned char *nd, ClassNo cd)
{
    unsigned char *dd=d;
    int *oo=o, lsz=lineSize;
    ClassNo *cc=c;
    int i=0;
    for (i=0; i<nItem; i++)
    {
        if (equals(nd,dd)&&(*cc==cd)) { (*oo)++; return; }
        oo++; cc++; dd+=lsz;
    }
    // we have normally already allocated the space but we check anyway...
    if (nItem>=nExample)
    {
        printf("binary dataset error.\n");
        exit(200);
    }
    memcpy(dd,nd,lsz);
    *cc=cd;
    *oo=1;
    nItem++;
}

#endif

// Decodes one upper-case hexadecimal digit ('0'-'9', 'A'-'F') to its value.
static unsigned char bdsHexDigit(unsigned char ch)
{
    return (unsigned char)((ch<'A') ? (ch-'0') : (ch-'A'+10));
}

// Reports a truncated or malformed binary data set file and terminates.
static void bdsBadFile()
{
    fprintf(stderr,"binary dataset file error.\n");
    exit(200);
}

// Loads a binary data set from the text produced by BinaryDataSet::save():
//   line 1: nAtt, line 2: number of rows (becomes both nExample and nItem),
//   then one line per row: 2*lineSize upper-case hexadecimal digits, a blank,
//   the occurrence count, a blank and the class number on three digits.
//
// Parameter:
//   stream  open text stream positioned at the start of the header.
//
// A missing line, a negative row count or a row line too short to hold its
// fields prints an error and calls exit(200) instead of parsing stale buffer
// contents or scanning past the end of the line.
BinaryDataSet::BinaryDataSet(FILE *stream)
{
    char buffer[30000];
    int i,j;

    if (fgets(buffer,sizeof(buffer),stream)==NULL) bdsBadFile();
    nAtt=atol(buffer);
    if (fgets(buffer,sizeof(buffer),stream)==NULL) bdsBadFile();
    nExample=nItem=atol(buffer);
    if (nExample<0) bdsBadFile();
    initMemory();

    unsigned char *dd=d;
    ClassNo *cc=c;
    int *oo=o;
    const size_t hexLen=(size_t)lineSize*2;

    for (i=0; i<nExample; i++)
    {
        if (fgets(buffer,sizeof(buffer),stream)==NULL) bdsBadFile();
        // need at least the hex field, the blank and one count digit
        if (strlen(buffer)<hexLen+2) bdsBadFile();

        // row bits: two hexadecimal digits per byte, high nibble first
        for (j=0; j<lineSize; j++)
        {
            const unsigned char hi=bdsHexDigit((unsigned char)buffer[j<<1]);
            const unsigned char lo=bdsHexDigit((unsigned char)buffer[(j<<1)+1]);
            *(dd++)=(unsigned char)((hi<<4)|lo);
        }

        // occurrence count: decimal digits up to the next blank
        const char *p=buffer+hexLen+1;
        int count=0;
        while ((*p!='\0')&&(*p!=' ')) { count=count*10+(*p-'0'); p++; }

        // the blank must be followed by the three class digits
        if ((p[0]!=' ')||(p[1]=='\0')||(p[2]=='\0')||(p[3]=='\0')) bdsBadFile();
        *(oo++)=count;
        *(cc++)=(ClassNo)((p[1]-'0')*100+(p[2]-'0')*10+(p[3]-'0'));
    }
}

// Destructor: releases the three arrays allocated by initMemory() --
// the row bits (d), the occurrence counts (o) and the class numbers (c).
BinaryDataSet::~BinaryDataSet()
{
    free(d);
    free(o);   // was never released before: leaked on every destruction
    free(c);
}

// Derives the row geometry from nAtt and allocates storage for nExample rows.
//   lineSize = (nAtt-1)/8+1 bytes per row (8 attributes per byte).
//   endChar  = a mask with the low (nAtt-1)%8+1 bits set, presumably the bits
//              in use in the last byte of a row.
//   d        = lineSize*nExample bytes of row data (left uninitialised),
//   o        = nExample occurrence counts, zeroed,
//   c        = nExample class numbers (left uninitialised).
void BinaryDataSet::initMemory()
{
    const int usedBits=(nAtt-1)%8+1;
    lineSize=(nAtt-1)/8+1;

    // start with one bit set and widen the mask one bit at a time
    endChar=1;
    for (int bit=1; bit<usedBits; ++bit)
        endChar=(endChar<<1)|endChar;

    d=(unsigned char *)malloc(lineSize*nExample);

    o=(int*)malloc(nExample*sizeof(int));
    memset(o,0,nExample*sizeof(int));

    c=(ClassNo *)malloc(nExample);
}
