/***************************************
 * copyright (c) Vanden Berghen Frank  *
 * V 1.2                               *
 * *************************************/
 
/*************************************************************************/
/*                                     */
/*    Prune a decision tree and predict its error rate         */
/*    ------------------------------------------------         */
/*                                     */
/*************************************************************************/

#include <math.h>
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include "C45.h"
//#include <crtdbg.h>

/* Confidence level, defined in another translation unit.  AddErrs() reads
   it both to pick the deviation coefficient and in log(CF). */
extern double CF; // confidence level

/* File-local state shared by the routines below.
   Changed  - reset by Prune(), set by EstimateErrors() whenever it replaces
              a node by a leaf or by one of its branches.
   bWeight  - when true, EstimateErrors() sums weight(i) instead of counting
              cases.
   Item     - table of case rows; set by Prune() and ErrorRate().
   MaxItem  - number of cases; passed as the exclusive upper index bound.
   MaxAtt   - index of the row slot that holds the class (see Class()).
   MaxClass - number of classes iterated over (c < MaxClass). */
static Boolean  Changed, bWeight;
static double   **Item;
static ItemNo	 MaxItem;
static Attribute MaxAtt;	/* max att number */
static ClassNo	 MaxClass;

/* Row accessors.  CVal reads attribute slot `Attribute` of case `Case`.
   Class and weight reinterpret the slots at MaxAtt and MaxAtt+1 of the row
   as a ClassNo and an ItemCount respectively.
   NOTE(review): this reads a double slot through another type; confirm the
   rows are written the same way by whoever builds Item. */
#define  CVal(Case,Attribute)   Item[Case][Attribute]
#define  Class(Case)			(*((ClassNo*)(Item[Case]+MaxAtt)))
#define  weight(Case)			(*((ItemCount*)(Item[Case]+MaxAtt+1)))

/* Guard for trace output.  The argument x is ignored, a variable named Sh
   must be in scope, and the `&& 0` makes the condition always false, so the
   guarded statement never executes as written. */
#define    LocalVerbosity(x)    if (Sh >= 0 && 0)

/* Not defined in the visible part of this file; EstimateErrors() calls it
   with (T->Errors, T->Items) and stores the result in T->Confidence. */
double confidenceLevel(double error, double nItems);
	
/*
 *  Start a new output line and indent it for nesting depth Sh by printing
 *  Sh copies of "| " on stdout.
 *
 *  A zero or negative depth prints only the newline.  The explicit "> 0"
 *  matters: counting a negative short down to zero would wrap around and
 *  emit tens of thousands of markers.
 */
void Intab(short Sh)
{
    printf("\n");
    while ( Sh-- > 0 ) printf("| ");
}

/*************************************************************************/
/*                                                                    */
/*        Exchange items at a and b                  */
/*                                     */
/*************************************************************************/

static void Swap(ItemNo a,ItemNo b)
{
    /* Only the row pointers are exchanged; the case data is not copied. */
    double *const rowA = Item[a];

    Item[a] = Item[b];
    Item[b] = rowA;
}

						
/*************************************************************************/
/*                                                                       */
/*  Compute the additional errors if the error rate increases to the	 */
/*  upper limit of the confidence level.  The coefficient is the	     */
/*  square of the number of standard deviations corresponding to the	 */
/*  selected confidence level.  (Taken from Documenta Geigy Scientific	 */
/*  Tables (Sixth Edition), p185 (with modifications).)			         */
/*									                                     */
/*************************************************************************/


/*  Interpolation table used by AddErrs(): Val[i] is a confidence level and
    Dev[i] the deviation paired with it.  Val is ascending from 0 to 1 and
    both arrays have the same number of entries; AddErrs() interpolates
    linearly between neighbouring entries.  */
double Val[] = {  0,  0.001, 0.005, 0.01, 0.05, 0.10, 0.20, 0.40, 1.00},
       Dev[] = {4.0,  3.09,  2.58,  2.33, 1.65, 1.28, 0.84, 0.25, 0.00};


double AddErrs(ItemCount N, ItemCount e)
/*
 *  N  cases (or total weight) covered by a leaf
 *  e  of them classified incorrectly
 *
 *  Returns the number of errors to add to e so that the error rate reaches
 *  the upper limit of the confidence interval selected by CF, i.e.
 *  N * Ucf(e, N) - e.
 *
 *  The squared deviation coefficient is interpolated from Val[]/Dev[] and
 *  cached; it is recomputed whenever CF differs from the value it was
 *  computed for.  CF is clamped to the range covered by Val[] so that the
 *  table lookup can never index outside the arrays.
 */
{
    static double Coeff = 0, CoeffCF = 0;
    static int    CoeffValid = 0;
    const int     Last = (int) (sizeof Val / sizeof Val[0]) - 1;
    double        Val0, Pr, cf;

    /*  An empty leaf has nothing to add; this also keeps N out of the
        denominators below.  */

    if ( N <= 0 ) return 0;

    cf = CF;
    if ( cf < Val[0] )    cf = Val[0];
    if ( cf > Val[Last] ) cf = Val[Last];

    if ( ! CoeffValid || cf != CoeffCF )
    {
        /*  Find the first table entry that is >= cf.  Starting at 1 and
            stopping at Last keeps both i-1 and i inside the table.  */

        int i = 1;

        while ( i < Last && cf > Val[i] ) i++;

        Coeff = Dev[i-1] +
                (Dev[i] - Dev[i-1]) * (cf - Val[i-1]) / (Val[i] - Val[i-1]);
        Coeff = Coeff * Coeff;

        CoeffCF    = cf;
        CoeffValid = 1;
    }

    /*  No observed errors: closed form  */

    if ( e < 1E-6 )
    {
        return N * (1 - exp(log(cf) / N));
    }

    /*  Fractional error count below one: interpolate between the values
        for e = 0 and e = 1  */

    if ( e < 0.9999 )
    {
        Val0 = N * (1 - exp(log(cf) / N));
        return Val0 + e * (AddErrs(N, 1) - Val0);
    }

    /*  (Nearly) every case is an error  */

    if ( e + 0.5 >= N )
    {
        return 0.67 * (N - e);
    }

    /*  General case: upper limit from the approximation using Coeff  */

    Pr = (e + 0.5 + Coeff/2
            + sqrt(Coeff * ((e + 0.5) * (1 - (e + 0.5)/N) + Coeff/4)) )
         / (N + Coeff);

    return (N * Pr - e);
}


/************************************************/
/*												*/
/*    Estimate the errors in a given subtree    */
/*												*/
/************************************************/

/*
 *  Estimate the errors made by subtree T on the cases Item[Fp] .. Item[Lp-1]
 *  (Lp is exclusive; the range may be empty).
 *
 *  T           root of the subtree to evaluate
 *  Fp, Lp      half-open range of case indices into Item
 *  Sh          nesting depth, used only by the trace output
 *  UpdateTree  when true, the counts and estimates are stored in the tree
 *              and the tree is pruned: T may be turned into a leaf or be
 *              overwritten by its larger branch, and Changed is then set.
 *              When false the tree is left untouched.
 *
 *  Returns the estimated number of errors: for a leaf the observed errors
 *  plus AddErrs(); for a test node the sum over both branches, or - when
 *  UpdateTree is true - the value finally stored in T->Errors.
 *
 *  Side effect: for a test node the cases in [Fp,Lp) are reordered so that
 *  those with CVal(i, T->Tested) <= T->Cut come first.
 */
double EstimateErrors(Node *T, ItemNo Fp, ItemNo Lp, short Sh, Boolean UpdateTree)
{
    ItemNo i, Ep;
    ItemCount Cases, *LocalClassDist, LeafErrors;
    Node *biggestT;
    ClassNo c, BestClass;
    double ExtraLeafErrors, Thresh, TreeErrors, BranchErrors;
    int Att;

    /*  Generate the class frequency distribution  */

    LocalClassDist = (ItemCount *) calloc(MaxClass+1, sizeof(ItemCount));
    if ( ! LocalClassDist )
    {
        /*  No error channel in the return value; stop rather than
            dereference a null pointer  */
        fprintf(stderr, "EstimateErrors: out of memory\n");
        abort();
    }
    for (c = 0; c < MaxClass; c++) LocalClassDist[c] = 0.0;

    if ( bWeight )
    {
        Cases = 0;
        for (i = Fp; i < Lp; i++)
        {
            Cases += weight(i);
            LocalClassDist[ Class(i) ] += weight(i);
        }
    }
    else
    {
        Cases = Lp - Fp;
        for (i = Fp; i < Lp; i++) LocalClassDist[ Class(i) ]++;
    }

    /*  Find the most frequent class (the current leaf class wins ties)
        and update the tree  */

    BestClass = T->Leaf;
    for (c = 0; c < MaxClass; c++)
        if ( LocalClassDist[c] > LocalClassDist[BestClass] ) BestClass = c;

    LeafErrors = Cases - LocalClassDist[BestClass];
    ExtraLeafErrors = AddErrs(Cases, LeafErrors);

    if ( UpdateTree )
    {
        T->Items = Cases;
        T->Leaf  = BestClass;
        memcpy(T->ClassDist, LocalClassDist, MaxClass * sizeof(ItemCount));
    }

    /*  The histogram is not needed past this point; releasing it here
        keeps it from being held across the recursion and gives the
        function a single release point.  */

    free(LocalClassDist);

    if ( ! T->NodeType )    /*  leaf  */
    {
        TreeErrors = LeafErrors + ExtraLeafErrors;

        if ( UpdateTree )
        {
            T->Errors = TreeErrors;

            LocalVerbosity(1)
            {
                Intab(Sh);
                    printf(" (%f:%f/%.2f)\n", 
                        T->Items, LeafErrors, T->Errors);
            }
            T->Confidence = confidenceLevel(T->Errors, T->Items);
        }

        return TreeErrors;
    }

    /*  Group the cases on the tested attribute: after this, [Fp,Ep) holds
        the cases with value <= Thresh and [Ep,Lp) the others.  The bound
        is tested first so that Item[Lp] is never read.  */

    Thresh = T->Cut;
    Att    = T->Tested;
    Ep     = Fp;
    while ( Ep < Lp && CVal(Ep, Att) <= Thresh ) Ep++;
    for (i = Ep + 1; i < Lp; i++)
        if ( CVal(i, Att) <= Thresh ) { Swap(Ep, i); Ep++; }

    /*  Estimate errors for each branch.  Both branches are always visited,
        also when one side is empty, so TreeErrors and biggestT are defined
        on every path.  The larger branch is the one holding more cases;
        the upper branch wins ties.  */

    biggestT = ( Ep - Fp > Lp - Ep ) ? T->lower : T->upper;

    TreeErrors  = EstimateErrors(T->lower, Fp, Ep, Sh+1, UpdateTree);
    TreeErrors += EstimateErrors(T->upper, Ep, Lp, Sh+1, UpdateTree);

    if ( ! UpdateTree )
    {
        return TreeErrors;
    }

    /*  See how the largest branch would fare on all cases of this node  */

    BranchErrors = EstimateErrors(biggestT, Fp, Lp, -1000, false);

    LocalVerbosity(1)
    {
        Intab(Sh);
        printf(":  [%d%%  N=%.2f  tree=%.2f  leaf=%.2f+%.2f  br[??]=%.2f]\n",
        (int) ((TreeErrors * 100) / (T->Items + 0.001)),
        T->Items, TreeErrors, LeafErrors, ExtraLeafErrors,
        BranchErrors);
    }

    /*  See whether tree should be replaced with leaf  */

    if ( LeafErrors + ExtraLeafErrors <= BranchErrors + 0.1 &&
         LeafErrors + ExtraLeafErrors <= TreeErrors + 0.1 )
    {
        LocalVerbosity(1)
        {
            Intab(Sh);
        }

        /*  NOTE(review): the branches stay attached to T and are not
            released here; who owns them cannot be told from this file.  */

        T->NodeType = 0;
        T->Errors = LeafErrors + ExtraLeafErrors;
        T->Confidence = confidenceLevel(T->Errors, T->Items);
        Changed = true;
        return T->Errors;
    }

    /*  See whether tree should be replaced with largest branch  */

    if ( BranchErrors <= TreeErrors + 0.1 )
    {
        LocalVerbosity(1)
        {
            Intab(Sh);
            printf("Replaced with branch ?\n");
        }

        /*  Re-estimate the branch on all cases of this node, this time
            storing the results, then copy it over T.
            NOTE(review): the copied-from node, the other branch and T's
            previous contents are not released here; who owns them cannot
            be told from this file.  */

        EstimateErrors(biggestT, Fp, Lp, Sh, true);
        memcpy((char *) T, (char *) biggestT, sizeof(Node));
        Changed = true;
        T->Confidence = confidenceLevel(T->Errors, T->Items);
        return T->Errors;
    }

    /*  Keep the test node as it is  */

    T->Errors = TreeErrors;
    T->Confidence = confidenceLevel(T->Errors, T->Items);
    return T->Errors;
}


/****************************************************************/
/*																*/
/*  Prune tree T, returning true if tree has been modified      */
/*																*/
/****************************************************************/

Boolean Prune(Node *T,double **_Item, ItemNo _nItem,int _MaxClass, 
				int _MaxAtt, Boolean _bWeight)
{
    /*  Publish the data set description to the file-local state that
        EstimateErrors() and the row-access macros read  */

    Item     = _Item;
    MaxItem  = _nItem;
    MaxAtt   = _MaxAtt;
    MaxClass = _MaxClass;
    bWeight  = _bWeight;

    Verbosity(1) printf("\n");

    /*  EstimateErrors() raises Changed whenever it replaces a node  */

    Changed = false;
    EstimateErrors(T, 0, MaxItem, 0, true);

    return Changed;
}


/************************************/
/*									*/
/*  Estimate error rate on tree T   */
/*									*/
/************************************/

double ErrorRate(Node *T,double **_Item, ItemNo _MaxItem)
{
    double Estimate;

    /*  Only the case table and its size are replaced here.  MaxAtt,
        MaxClass and bWeight keep whatever the last Prune() call stored
        (or their zero initial values if Prune() was never called).  */

    Item    = _Item;
    MaxItem = _MaxItem;

    /*  UpdateTree is false: the tree is evaluated but not modified  */

    Estimate = EstimateErrors(T, 0, MaxItem, 0, false);
    return Estimate;
}
