#define PROG_NAME "test_nnclassif" #define PROG_DESC "test nearest neihbor classifiers" #define PROG_VERS "1.0" #define make_test_classif_data_C_COPYRIGHT \ "Copyright © 2010 by the State University of Campinas (UNICAMP)" /* Last edited on 2024-12-05 10:24:46 by stolfi */ #define PROG_HELP \ " " PROG_NAME " \\\n" \ " -problem {PROBLEM} \\\n" \ " -seed {SEED} \\\n" \ " -samples {NS_MODEL} {NS_REFINE} {NS_EVAL} \\\n" \ " -prefix {PREFIX} \\\n" \ " [ -attributes {NAR} {NAI} ] \\\n" \ " [ -classes {NC} ] \\\n" \ " [ -noise {SIGMA} ] \\\n" \ " [ -opf { N | Y } ] \\\n" \ " [ -refine {REFINE_ITERS} ] \\\n" \ " [ -swap { same | full } ] \\\n" \ " [ -grid { N | Y } ] \\\n" \ " [ -verify { N | Y } ] \\\n" \ " [ -image {IMG_SIZE} {SUBSMP} ] \\\n" \ " " argparser_help_info_HELP "" #define PROG_INFO \ "NAME\n" \ " " PROG_NAME " - " PROG_DESC "\n" \ "\n" \ "SYNOPSIS\n" \ PROG_HELP "\n" \ "\n" \ "DESCRIPTION\n" \ PROG_INFO_DESCRIPTION "\n" \ "\n" \ "OUTPUTS\n" \ PROG_INFO_OUTPUTS "\n" \ "\n" \ "OPTIONS\n" \ PROG_INFO_OPTIONS "\n" \ "\n" \ "DOCUMENTATION OPTIONS\n" \ argparser_help_info_HELP_INFO "\n" \ "\n" \ "SEE ALSO\n" \ " The San Francisco Exploratorium.\n" \ "\n" \ "SMELL ALSO\n" \ " The roses along the road.\n" \ "\n" \ "AUTHOR\n" \ " Created by Jorge Stolfi, IC-UNICAMP.\n" \ "\n" \ "MODIFICATION HISTORY\n" \ " 2010-05-24 by J. Stolfi: created program.\n" \ "\n" \ "WARRANTY\n" \ argparser_help_info_NO_WARRANTY "\n" \ "\n" \ "RIGHTS\n" \ " " make_test_classif_data_C_COPYRIGHT ".\n" \ "\n" \ argparser_help_info_STANDARD_RIGHTS #define PROG_INFO_DESCRIPTION \ " Tests the nearest-neighbor (1NN) vector classifier, with a model of {NS_MODEL} samples" \ " for a /discrete classification function/ specified by {PROBLEM}, {NAR} and {NC}.\n" \ "\n" \ " Each sample is a vector of attributes. Each attribute is a real number. The total number of" \ " attributes per sample is {NA = NAR + NAI} where {NAR} and {NAI} are" \ " user-specifiable numbers. The first {NAR} attributes are \"relevant\" to the test problem;" \ " the the remaining {NAI} attributes are \"irrelevant\" random numbers, uniformly distributed" \ " in {U = [-1 _ +1]}, appended to them.\n" \ "\n" \ " Considering only the {NAR} relevant attributes, the classification function" \ " defined by {PROBLEM,NAR,NC} define a space of /valid" \ " attribute vectors/ contained in {U^NAR} that is partitioned {NC}" \ " disjoint /class domains/ numbered {1..NC}. The number" \ " and shape of the domains, and the random sampling probability distribution" \ " in each domain depends on the. For some {PROBLEM}s" \ " the number of attributes {NAR} and/or classes {NC} may be restricted or fixed. The class domains" \ " may not cover the whole of {U^NAR} and may or may not touch each other. Some" \ " domains may be zero-measure sets with dimension smaller than {NAR}.\n" \ "\n" \ " The program" \ " generates {NS} samples in these domains (where {NS} is either {NS_MODEL}, {NS_REFINE}," \ " or {NS_EVAL}), and assigns to each sample" \ " a /nominal class/ which is the index of the containing domain. The samples" \ " may be generated at random or may be taken from a regular grid.\n" \ "\n" \ " When sampling on a grid, the number {NS} is implicitly rounded up to the next" \ " perfect power {NG^NAR}, and the problem is sampled at a regular grid of points" \ " with {NAR} samples along each axis and spanning the hypercube {U^NAR}. Only" \ " those samples that fall inside one of" \ " the class domains will be written out. In that case the actual number of" \ " samples in the output may be substantially larger or smaller than {NS}. Note" \ " that the arrangement of samples is regular only in {U^NAR}, not in {U^NA}\n" \ "\n" \ " After generating each sample (whether randomly or from a grid) and" \ " appending the {NAI} irrelevant attributes, the program" \ " adds to each attribute (relevant or irrelevant)" \ " a Gaussian noise with mean 0 and deviation {SIGMA}. The noise is truncated to {4*SIGMA}" \ " so that each final attribute is in the range {V = [-VMAX _ +VMAX]}" \ " where {VMAX = 1 + 4*SIGMA}. The nominal class of the sample is retained," \ " even though the perturbation may cause the" \ " attribute vector to fall outside the domain of that class and" \ " possibly into the domain of a different class.\n" \ "\n" \ " The program generates three datasets, a /model set/ {M} with {NS_MODEL} samples," \ " a /refinement set/ {R} with {NS_REFINE} samples, and an /evaluation set/ {E} with" \ " {NS_EVAL} samples. The model set {M}, always random, is used with the nearest" \ " neighbor algorithm to classify other samples. The evaluation set {E}, either random or grid-like, is used" \ " to check the accuracy of the 1NN classifier with the model. The refinement set {R}, also" \ " random, is used as a source of alternative samples that can be iteratively swapped with those of {M} to" \ " improve the performance of the 1NN classifier.\n" \ "\n" \ " The 1NN algorithm has three parameters: the list of representative samples {M}, a /class" \ " table/ {classM}, and a /handicap table/. For each {s} in {M}," \ " {classM(s)} is a positive class index" \ " and {HM(s)} is a non-negaive real number. The algorithm itself defines another classification function" \ " for {Reals^NA}, namely it assigns to each sample vector {t} in {Reals^NA} the class" \ " {classM(s)} of the sample {s} in {M} that minimizes {max(H(s),dist(t,s))}, where {dist}" \ " is some metric (currently the Euclidian one). The handicaps {H(m)} are either zero," \ " or are computed by the program from {M} and {classM} by the OPF algorithm" \ " (Spina, Falcão and Suzuki, 2009)." #define PROG_INFO_OUTPUTS \ " If \"-image\" is specified, the program will write a set of PPM images" \ " files called \"{PREFIX}-{TAG}-C.ppm\" for various {TAG} ad {KIND} strings. showing the\n" \ " classes of the ideal classifier defined by {PROBLEM,NAR,NC} (with {TAG} = \"I-tru\"), and of the 1NN" \ " classifiers {(D,HD,classD)} where {D} is either {M} ({TAG} = \"M-ini\") or {R} ({TAG} = \"R-ini\")." \ " In any clase {classD} is the labeling of {D} samples defined by the ideal" \ " classifier {PROBLEM,NAR,NC}, and {HD} is a set of handicaps" \ " computed from {D} and {classD} according to the \"-opf\"" \ " parameter. If {NS_REFINE} is positive it also generates images of the 1NN classifier with" \ " the model set {M'} after refinement ({TAG} = \"M-ref\"). In these images, each class domain will" \ " be painted with a distinct nonwhite color, with brightness" \ " increasing from 1 to {NC}. The background (class 0) is painted white.\n" \ "\n" \ " For each of the above 1NN classifiers, the program will also produce an" \ " image file called \"{PREFIX}-{TAG}-D.ppm\"" \ " showing the discrepancy between its classification and that of the" \ " deal classifier {PROBLEM,NAR,NC}. Points where the two classifiers agree, or where" \ " the ideal classifier returns 0, are shown in white; points where they" \ " disagree are shown in red.\n" \ "\n" \ " The number of relevant attributes {NAR} must be 2, and the X and Y axis are" \ " attribute 0 and attribute 1, respectively. Each image will span the augmented" \ " square {V^2}. In all these images, if \"-noise\" is requested, each sample is perturbed by" \ " the noise, and then the position of the original sample is painted with the color" \ " of the domain that contains the perturbed point. If {NAI} is positive, the" \ " irrelevant attributes will set to random values in the set {D}, as explained" \ " above, and to zeros in the samples being imaged. See the \"-image\" option for further details." #define PROG_INFO_OPTIONS \ " -problem {PROBLEM}\n" \ " This mandatory argument specifies the ideal class domains and" \ " the sampling probability distributions.\n" \ "\n" \ " -samples {NS_MODEL} {NS_REFINE} {NS_EVAL}\n" \ " This mandatory argument specifies the number of samples to generate" \ " in each of the three sets. If {NS_REFINE} is zero then no refinement" \ " will be tried.\n" \ "\n" \ " -seed {SEED}\n" \ " This mandatory argument specifies a seed for the random number generator.\n" \ "\n" \ " -prefix {PREFIX}\n" \ " This mandatory argument specifies the prefix for all output filenames.\n" \ "\n" \ " -attributes {NAR} {NAI}\n" \ " This optional argument specifies the number of attributes in" \ " each sample, namely {NAR} relevant ones and {NAI} irrelevant ones. The allowed" \ " values of {NAR} depend on the {PROBLEM}. If {NAR} is zero" \ " (the default) the number of relevant attributes depends on the {PROBLEM}. If {NAI} is zero" \ " (the default) there are no irrelevant attributes.\n" \ "\n" \ " -classes {NC}\n" \ " This optional argument specifies the number of classes into which" \ " the space of all attribute vectors is partitioned. If {NC} is zero" \ " (the default) the number of classes is selected by the {PROBLEM}. The" \ " allowed range of {NC} depends on the {PROBLEM}.\n" \ "\n" \ " -noise {SIGMA}\n" \ " This optional argument specifies the standard deviation" \ " of the Gaussian perturbation to be added to each attribute" \ " after sampling. Note that the noise may move some samples" \ " into the domain of a different class. The default is {SIGMA=0}" \ " (no perturbation).\n" \ "\n" \ " -opf { N | Y }\n" \ " This optional argument requests the use of handicaps" \ " when computing distances to the representantive samples" \ " in the classifer. The handicaps are" \ " computed by the OPF algorithm The default is \"N\" (no OPF handicaps).\n" \ "\n" \ " -refine {REFINE_ITERS}\n" \ " This optional argument directs the program to attempt an iterative" \ " improevment of the model set {M} by swapping some of its elements with" \ " those of the refinement set {R} so as to improve the accuracy of {M} when classifying {R}. At most" \ " {REFINE_ITERS} iterations will be performed. The default" \ " {REFINE_ITERS} depends on the size of {M} and {R}, with the \"full\" option.\n" \ "\n" \ " -swap { same | full }\n" \ " This optional argument specifies whether the refinement iteration should swap only" \ " elements of the same class if possible (\"same\") or may swap " \ " elements of distinct classes may be swapped as well (\"full\"). The default is \"full\".\n" \ "\n" \ " -verify { N | Y }\n" \ " This optional argument specifies whether the procedure should" \ " check that each randomly generated sample belongs to the domain of its nominal" \ " class. Errors detected before the" \ " random perturbation are fatal; errors after the perturbation are" \ " counted and reported to {stderr}. The default is \"N\" (no checking).\n" \ "\n" \ " -grid { N | Y }\n" \ " This optional argument specifies whether the procedure should" \ " generate the evaluation set {E} at a regular grid instead of" \ " randomly. The default is \"Y\" (grid sampling).\n" \ "\n" \ " -image {SIZE} {SUBSMP}\n" \ " This optional argument directs the program to write PPM" \ " image files called \"{PREFIX}-{TAG}-{KIND}.ppm\" showing the domain" \ " classes. This option can be used" \ " only when {NAR=2}. The images will have {SIZE} columns and {SIZE}" \ " rows. The program will generate {SUBSMP^2} samples" \ " inside each pixel, and average their colors to obtain the pixel" \ " color." #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define MAX_ATTRIBS 128 /* Max num of relevant attributes (param safety only) */ #define MAX_CLASSES 128 /* Max num of classes (param safety only) */ #define MAX_SAMPLES 100000000 /* Max num of samples to generate (param safety only) */ #define MAX_SEED (~0LU) /* Max seed value (param safety only) */ #define MAX_NOISE 10.0 /* Max variance of post-sampling noise (param safety only) */ #define MAX_IMAGE_SIZE 1024 /* Max rows and columns in image (param safety only) */ #define MAX_SUBSMP 5 /* Max pixel subsampling order for image (param safety only) */ #define MAX_REFINE_ITERS 100000 /* Max refinement iterations (param safety only) */ /* COMMAND-LINE OPTIONS */ typedef struct options_t { char *problem; /* Name of classification problem. */ int attributes_rel; /* Number of relevant attributes desired. */ int attributes_irr; /* Number of irrelevant attributes to append. */ int classes; /* Number of classes desired. */ int samples_model; /* Number of samples in NN model. */ int samples_refine; /* Number of samples in refinement (retraining) set. */ int samples_eval; /* Number of samples in evaluation set. */ uint32_t seed; /* Seed for randomizer. */ double noise; /* Deviation of post-sampling noise. */ bool_t opf; /* TRUE to use OPF handicaps in 1NN classifier. */ bool_t grid; /* TRUE to use grid sampling to obtain the {E} set. */ bool_t verify; /* TRUE to verify the generated samples. */ int refine_iters; /* Number of refinement iterations. */ bool_t swap_same; /* TRUE if refinement should swap only in same class. */ char *prefix; /* The ouput file prefix. */ /* Image output options: */ int image_size; /* Height and width of PPM image, or 0 if none. */ int image_subsmp; /* Will generate {subsmp*subsmp} samples in each pixel. */ } options_t; /* CANNED PROBLEMS */ typedef enum { problem_kind_SATURN, /* The "saturn" ("B9") dataset from Papa et al.(2009); {NC = 2}. */ problem_kind_PETALS, /* The "petals" ("B10") dataset from Papa et al.(2009); {NC = 4}. */ problem_kind_VESSEL, /* The "boat" ("B11") dataset from Papa et al.(2009); {NC = 3}. */ problem_kind_MBALLS, /* Each class {2..NC} comprises {(NC-1)^(NAR-1)} balls; class 1 is background. */ problem_kind_SHELLS, /* Classes {2..NC} are concentric shells; class 1 is background and center. */ problem_kind_STAOLI, /* Classes {2..3} are a fat and thin box; class 1 is background. */ problem_kind_NUMBER /* Number of valid problem kinds. Must be last. */ } problem_kind_t; /* Numeric problem kinds. The valid kinds are {0..problem_kind_NUMBER-1} */ char *problem_kind_name[problem_kind_NUMBER] = { [problem_kind_SATURN] = "saturn", [problem_kind_PETALS] = "petals", [problem_kind_VESSEL] = "vessel", [problem_kind_MBALLS] = "mballs", [problem_kind_SHELLS] = "shells", [problem_kind_STAOLI] = "staoli" }; /* External names of the problem kinds. */ typedef void problem_def_proc_t(int *NAR, int *NC); /* Type of a problem definition procedure. It sets {*NC} and {*NAR} to suitable defaults if any of them are zero, then performs range checking. */ /* INTERNAL PROTOTYPES */ typedef rn_classif_dataset_t dataset_t; typedef rn_classif_problem_t problem_t; options_t *parse_options(int argc, char **argv); /* Parses the command line arguments and packs them as an {options_t}. */ int main(int argc,char** argv); problem_t *get_problem(char *name, int NAR, int NC); /* Returns a {problem_t} given the problems {name} and the desired numbers of attributes {NAR} and classes {NC}. Fails if {NAR,NC} are not a valid combination for that problem. */ void generate_dataset(problem_t *P, int NAI, int NS, bool_t grid, options_t *o, dataset_t **DP, int **classDP); /* Returns a dataset containing {NS} samples of the classification problem {P}, augmented with {NAI} irrelevant attributes, according to options {o}. */ void generate_raw_dataset_random(problem_t *P, int NA, int NS, uint32_t seed, bool_t verify, dataset_t **DP, int **classDP); void generate_raw_dataset_grid(problem_t *P, int NA, int NS, uint32_t seed, dataset_t **DP, int **classDP); /* Returns a {dataset_t} containing random or grid samples of the classification problem {P} according to options {o}. Only the {P->NAR} relevant attributes of each sample are set. Does not add any noise. */ void dataset_stats_print(FILE *wr, int NS, int NC, int classD[]); /* Prints statistics on number and percentage of samples per class in the dataset {D}. */ void output_dataset(dataset_t *D, int classD[], int NC, options_t *o); /* Writes the dataset as {NC} files "{o->prefix}-c{NNN}.dat" where {CCC} is each class in 3 digit format. See {rn_classif_dataset_write} for the format. */ void verify_dataset(problem_t *P, dataset_t *D, int classD[], char *nameD); /* Compares the classification {classD} for the samples of {D} with the classification specified by {P.lab} with {P.NA} atributes and {P.NC} classes. Assumes that {classD} has only classes in {0..P.NC} too. If there are discrepancies, prints a cross-classification matrix with the datset {nameD} in the title.*/ void compute_handicaps(dataset_t *M, int classM[], bool_t opf, bool_t verbose, double HM[]); /* Computes the 1NN handicaps for the dataset {M} with classes {classM}. If {opf} is true uses the OPF algorithm, else sets all handicaps to zero. */ int evaluate_nn_classifier(int NC, dataset_t *M, int classM[], double HM[], char *nameM, dataset_t *E, int classE[], char *nameE); /* Uses the 1NN classifier {(M,classM,HM)} to classify each sample in the dataset {E}, ans compares the result with the classification {classE}. Prints a cross-classification table and returns the number of discrepancies {nf}, with the dataset {nameM,nameE} in the title. */ void refine_nn_model(int NC, dataset_t *M, int classM[], double HM[], dataset_t *R, int classR[], options_t *o, bool_t verbose); /* Tries to improve the 1NN classifier {(M,HM,classM)} by swapping some elements with those of {R}. */ void output_ideal_image(problem_t *P, char *tag, options_t *o); /* Writes PPM file "{o->prefix}-{tag}-C.ppm" showing the class domains of the problem {P}. */ void output_dataset_image(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o); /* Writes PPM file "{o->prefix}-{tag}-P.ppm" showing the points of the dataset {D}. !!! Should also write "{o->prefix}-{tag}-H.ppm" showing the handicaps as circles. */ void output_nn_images(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o); /* Writes PPM file "{o->prefix}-{tag}-C.ppm" showing the class domains of the 1NN classifier {(D,classD,HD)}. Then writes another image file "{o->prefix}-{tag}-D.pgm" that shows where that classification differs from the classification of {P}. */ void output_nn_hand_cmp_image(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o); /* Writes PPM file "{o->prefix}-{tag}-X.ppm" showing the effect of the handicaps {HD} on the 1NN classifier; that is, comparing the 1NN classifier {(D,HD,classD)} with {(D,NULL,classD)}. */ void output_image(char *prefix, char *tag, char *kind, uint16_image_t *img); /* Writes the image {img} to file "{prefix}-{tag}-{kind}.ppm". */ void add_random_noise(dataset_t *D, double sigma, uint32_t seed); /* Add the post-sampling noise. The perturbations are a fixed random function of the seed, indepednent of the random values used in sampling. */ void append_irrelevant_attributes(dataset_t *D, problem_t *P, uint32_t seed); /* Appends {D.NA - P.NAR} random values uniform in {U} to each sample in {D}. The values are a fixed random function of the seed, independent of those used in sampling and noise. */ void get_grid_indices(int g, int NG, int NAR, int ix[]); /* The integer {g} must be in the range {0..NG^NAR-1}. Decomposes {g} into its {NAR} digits {ix[0..NAR-1]} in base {NG}. */ /* IMPLEMENTATIONS */ int main(int argc, char** argv) { options_t *o = parse_options(argc, argv); problem_t *P = get_problem(o->problem, o->attributes_rel, o->classes); if (o->image_size > 0) { output_ideal_image(P, "I-tru", o); } /* Generate the model dataset (random): */ dataset_t *M = NULL; int *classM = NULL; assert(o->samples_model > 0); generate_dataset(P, o->attributes_irr, o->samples_model, FALSE, o, &M, &classM); if (o->verify) { verify_dataset(P, M, classM, "M"); } double *HM = notnull(malloc(M->NS*sizeof(double)), "no mem"); compute_handicaps(M, classM, o->opf, FALSE, HM); if (o->image_size > 0) { output_dataset_image(P, M, classM, HM, "M-ini", o); output_nn_images(P, M, classM, HM, "M-ini", o); if (o->opf) { output_nn_hand_cmp_image(P, M, classM, HM, "M-ini", o); } } /* Generate the evaluation dataset (grid or random): */ dataset_t *E = NULL; int *classE = NULL; if (o->samples_eval > 0) { generate_dataset(P, o->attributes_irr, o->samples_eval, o->grid, o, &E, &classE); if (o->verify) { verify_dataset(P, E, classE, "E"); } evaluate_nn_classifier(P->NC, M, classM, HM, "M-ini", E, classE, "E"); if (o->image_size > 0) { output_dataset_image(P, E, classE, NULL, "E-ini", o); } } /* Output images of the ideal and the unrefined 1NN classifiers: */ if (o->samples_refine > 0) { /* Generate the refinement dataset (random): */ dataset_t *R; int *classR; generate_dataset(P, o->attributes_irr, o->samples_refine, FALSE, o, &R, &classR); if (o->verify) { verify_dataset(P, R, classR, "R"); } if (o->image_size > 0) { output_dataset_image(P, R, classR, NULL, "R-ini", o); } /* Refine model and output images of the result: */ refine_nn_model(P->NC, M, classM, HM, R, classR, o, TRUE); if (E != NULL) { evaluate_nn_classifier(P->NC, M, classM, HM, "M-ref", E, classE, "E"); } /* Output images of the refined 1NN classifier: */ if (o->image_size > 0) { output_nn_images(P, M, classM, HM, "M-ref", o); output_dataset_image(P, M, classM, HM, "M-ref", o); if (o->opf) { output_nn_hand_cmp_image(P, M, classM, HM, "M-ref", o); } } } return 0; } void compute_handicaps(dataset_t *M, int classM[], bool_t opf, bool_t verbose, double HM[]) { if (opf) { rn_classif_opf_compute_handicaps(M, classM, rn_dist, HM, verbose); } else { int i; for (i = 0; i < M->NS; i++) { HM[i] = 0; } } } void generate_dataset(problem_t *P, int NAI, int NS, bool_t grid, options_t *o, dataset_t **DP, int **classDP) { int NAR = P->NA; int NA = NAR + NAI; if (grid) { generate_raw_dataset_grid(P, NA, NS, o->seed, DP, classDP); } else { generate_raw_dataset_random(P, NA, NS, o->seed, o->verify, DP, classDP); } dataset_t *D = (*DP); int *classD = (*classDP); dataset_stats_print(stderr, D->NS, P->NC, classD); append_irrelevant_attributes(D, P, o->seed); if (o->noise > 0) { add_random_noise(D, o->noise, o->seed); } } void dataset_stats_print(FILE *wr, int NS, int NC, int classD[]) { int *num = notnull(malloc((NC+1)*sizeof(int)), "no mem"); rn_classif_class_count(NS, classD, NC, num); rn_classif_class_count_print(wr, NC, num); free(num); } void output_dataset(dataset_t *D, int classD[], int NC, options_t *o) { int cl; for (cl = 1; cl <= NC; cl++) { char *fname = NULL; char *fname = jsprintf("%s-c%03d.dat", o->prefix, cl); FILE *wr = open_write(fname, TRUE); rn_classif_dataset_write(wr, D, cl, classD); fclose(wr); free(fname); } } void verify_dataset(problem_t *P, dataset_t *D, int classD[], char *nameD) { int *classDX = notnull(malloc(D->NS*sizeof(int)), "no mem"); rn_classif_dataset_label(D, P->NA, P->NC, P->lab, classDX); int ns, nf; rn_classif_compare(D->NS, classD, classDX, &ns, &nf); if (nf != 0) { fprintf(stderr, "\n"); fprintf(stderr, "=== cross-classification of ideal vs 1NN(%s) on set %s ===\n", nameD, nameD); int *ncc; rn_classif_cross_matrix_build(D->NS, classD, P->NC, classDX, P->NC, &ncc); rn_classif_cross_matrix_print(stderr, P->NC, P->NC, ncc, TRUE); free(ncc); fprintf(stderr, "\n"); } free(classDX); } int evaluate_nn_classifier(int NC, dataset_t *M, int classM[], double HM[], char *nameM, dataset_t *E, int classE[], char *nameE) { int *classEX = notnull(malloc(E->NS*sizeof(int)), "no mem"); rn_classif_nn_label_dataset(M, HM, classM, E, rn_dist, classEX); fprintf(stderr, "=== cross-classification of 1-NN(%s) on evaluation set %s ===\n", nameM, nameE); int *ncc; rn_classif_cross_matrix_build(E->NS, classE, NC, classEX, NC, &ncc); rn_classif_cross_matrix_print(stderr, NC, NC, ncc, TRUE); free(ncc); fprintf(stderr, "\n"); int ns, nf; rn_classif_compare(E->NS, classE, classEX, &ns, &nf); free(classEX); return nf; } void refine_nn_model(int NC, dataset_t *M, int classM[], double HM[], dataset_t *R, int classR[], options_t *o, bool_t verbose) { auto void hproc(dataset_t *D, int classD[], double HD[]); int *classRX = notnull(malloc(R->NS*sizeof(int)), "no mem"); int maxIters = o->refine_iters; if (maxIters <= 0) { maxIters = (int)imin(3*M->NS, 3*R->NS); } rn_classif_nn_improve(M, HM, classM, R, classR, classRX, rn_dist, hproc, maxIters, o->swap_same, verbose); /* Local procedure implementation: */ void hproc(dataset_t *D, int classD[], double HD[]) { compute_handicaps(D, classD, o->opf, FALSE, HD); } } void output_ideal_image(problem_t *P, char *tag, options_t *o) { r2_t ctr = (r2_t){{ 0,0, }}; /* Center of imaged area. */ double HV = 1.0 + 4*o->noise; /* Half-extent of imaged area. */ srandom(o->seed + 314159); fprintf(stderr, "computing the ideal classifier image '%s-C' ...\n", tag); uint16_image_t *imgC = rn_classif_uint16_image ( P->NA, P->NC, P->lab, 0, NULL, 0, NULL, NULL, NULL, &ctr, HV, o->image_size, o->image_subsmp, o->noise ); output_image(o->prefix, tag, "C", imgC); uint16_image_free(imgC); } void output_dataset_image(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o) { r2_t ctr = (r2_t){{ 0,0, }}; /* Center of imaged area. */ double HV = 1.0 + 4*o->noise; /* Half-extent of imaged area. */ srandom(o->seed + 314159); fprintf(stderr, "computing the dataset image '%s-P' ...\n", tag); uint16_image_t *imgC = rn_classif_uint16_image ( P->NA, 0, NULL, P->NC, P->lab, P->NC, D, HD, classD, &ctr, HV, o->image_size, o->image_subsmp, o->noise ); output_image(o->prefix, tag, "P", imgC); uint16_image_free(imgC); } void output_nn_images(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o) { auto int evalDx(int NAX, int NCX, double px[]); /* The 1NN classifier {(D,classD,HD)}, with {px} already expanded from {NA} attributes to {D.NA} attributes. Requires {NCX==P.NC}, {NAX==D.NA}; assumes that {classD} ranges over {0..P.NC}. */ auto int evalD(int NAX, int NCX, double p[]); /* The 1NN classifier {(D,classD,HD)}, that expands {p} with random {U} values from {NA} attributes to {D.NA} attributes. Requires {NCX==P.NC}, {NAX==P.NA} and {P.NA <= D.NA}; assumes that {classD} ranges over {0..P.NC}. */ auto int diffPD(int NAX, int NCX, double p[]); /* A labeler that compares the classifier {P} on {p[0..NA-1} with the 1NN {evalD} classifier above. If {P} assigns class 0, returns 0. Otherwise, if the two classes agree, returns 2. Otherwise returns 1. Requires {NCX==2} and {NAX==P.NA}. */ r2_t ctr = (r2_t){{ 0,0, }}; /* Center of imaged area. */ double HV = 1.0 + 4*o->noise; /* Half-extent of imaged area. */ fprintf(stderr, "computing the 1NN classifier image '%s-C' ...\n", tag); srandom(o->seed + 314159); uint16_image_t *imgC = rn_classif_uint16_image ( P->NA, P->NC, evalD, P->NC, P->lab, P->NC, D, HD, classD, &ctr, HV, o->image_size, o->image_subsmp, o->noise ); output_image(o->prefix, tag, "C", imgC); uint16_image_free(imgC); fprintf(stderr, "computing the ideal/1NN classifier discrepancy image '%s-D' ...\n", tag); srandom(o->seed + 314159); uint16_image_t *imgD = rn_classif_uint16_image ( P->NA, 2, diffPD, 0, NULL, 0, D, HD, NULL, &ctr, HV, o->image_size, o->image_subsmp, o->noise ); output_image(o->prefix, tag, "D", imgD); uint16_image_free(imgD); int evalDx(int NAX, int NCX, double px[]) { assert(NAX == D->NA); int imin = rn_classif_find_nearest_in_dataset(D, HD, D->NA, px, rn_dist); assert((imin >= 0) && (imin <= D->NS)); int cl = classD[imin]; assert((cl >= 0) && (cl <= P->NC)); return cl; } int evalD(int NAX, int NCX, double p[]) { assert(NAX == P->NA); assert(NAX <= D->NA); /* Expand {p[0..NA-1]} to {px[0..D.NA-1]} by appending random values: */ double px[D->NA]; int t; for (t = 0; t < D->NA; t++) { px[t] = (t < D->NA ? p[t] : 2*drandom() - 1); } /* Classify {px} with {(D,HD,classD)}: */ return evalDx(D->NA, NCX, px); } int diffPD(int NAX, int NCX, double p[]) { assert(NAX == P->NA); assert(NCX == 2); int clP = P->lab(NAX, P->NC, p); if (clP == 0) { return 0;} else { int clD = evalD(NAX, P->NC, p); return (clD == clP ? 2 : 1); } } } void output_nn_hand_cmp_image(problem_t *P, dataset_t *D, int classD[], double HD[], char *tag, options_t *o) { auto int evalDx(int NAX, int NCX, double HX[], double px[]); /* The 1NN classifier {(D,classD,HD)}, with {px} already expanded from {NA} attributes to {D.NA} attributes. The handicap tbale HX may be NULL. Requires {NCX==P.NC}, {NAX==D.NA}; assumes that {classD} ranges over {0..P.NC}. */ auto int diffH(int NAX, int NCX, double p[]); /* A labeler that compares the 1NN classifier {(D,HD,classD)} with {(D,NULL,classD)}. If {P} returns 0, returns 0. Otherwise, if the handicaps were an iprovement, returns 3. If they were a disimprovement, returns 1. If they made no difference, returns 2. If {Requires {NCX==3} and {NAX==P.NA}. */ r2_t ctr = (r2_t){{ 0,0, }}; /* Center of imaged area. */ double HV = 1.0 + 4*o->noise; /* Half-extent of imaged area. */ srandom(o->seed + 314159); fprintf(stderr, "computing the 1NN classifier with/sans H discrepancy image '%s-G' ...\n", tag); srandom(o->seed + 314159); uint16_image_t *imgG = rn_classif_uint16_image ( P->NA, 3, diffH, P->NC, P->lab, 0, D, HD, NULL, &ctr, HV, o->image_size, o->image_subsmp, o->noise ); output_image(o->prefix, tag, "G", imgG); uint16_image_free(imgG); int evalDx(int NAX, int NCX, double HX[], double px[]) { assert(NAX == D->NA); int imin = rn_classif_find_nearest_in_dataset(D, HX, D->NA, px, rn_dist); assert((imin >= 0) && (imin <= D->NS)); int cl = classD[imin]; assert((cl >= 0) && (cl <= P->NC)); return cl; } int diffH(int NAX, int NCX, double p[]) { assert(NAX == P->NA); assert(NCX == 3); int clP = P->lab(NAX, P->NC, p); if (clP == 0) { return 0;} else { /* Expand {p[0..NA-1]} to {px[0..D.NA-1]} by appending random {U} values: */ double px[D->NA]; int t; for (t = 0; t < D->NA; t++) { px[t] = (t < D->NA ? p[t] : 2*drandom() - 1); } /* Classify {px} with and without {HD}: */ int cl_with_H = evalDx(D->NA, P->NC, HD, px); int cl_sans_H = evalDx(D->NA, P->NC, NULL, px); /* Compare classifications: */ if (cl_with_H == cl_sans_H) { if (cl_with_H == clP) { /* All three agree: */ return 2; } else { /* With/sans agree but both are wrong: */ return 2; } } else { if (cl_with_H == clP) { /* Handicaps improved: */ return 3; } else if (cl_sans_H == clP) { /* Handicaps made it worse: */ return 1; } else { /* Handicap changed but did not improve: */ return 2; } } } } } void output_image(char *prefix, char *tag, char *kind, uint16_image_t *img) { char *fname = jsprintf("%s-%s-%s.%s", prefix, tag, kind, (img->chns == 1 ? "pgm" : "ppm")); uint16_image_write_pnm_named(fname, img, FALSE, TRUE); free(fname); } void append_irrelevant_attributes(dataset_t *D, problem_t *P, uint32_t seed) { int NS = D->NS; int NAR = P->NA; int NA = D->NA; /* Append the irrelevant attributes {NAR..NA-1}: */ /* Must do this separately so that the essential ones are fixed for a given {seed}. */ srandom(seed + 4615); int i, t; for (i = 0; i < NS; i++) { double *attri = D->smp[i]; for (t = NAR; t < NA; t++) { attri[t] = 2*drandom() - 1; } } } void add_random_noise(dataset_t *D, double sigma, uint32_t seed) { int NS = D->NS; int NA = D->NA; srandom(seed + 19501129); int i, t; for (i = 0; i < NS; i++) { double *attri = D->smp[i]; for (t = 0; t < NA; t++) { /* Add to {attri[t]} a Gaussian deviate truncated to {4*sigma} */ double dit = sigma*fmax(-4.0, fmin(+4.0, dgaussrand())); attri[t] += dit; } } } void generate_raw_dataset_random(problem_t *P, int NA, int NS, uint32_t seed, bool_t verify, dataset_t **DP, int **classDP) { int NC = P->NC; int NAR = P->NA; dataset_t *D = rn_classif_dataset_new(NS, NA); int *classD = notnull(malloc(NS*sizeof(int)), "no mem"); /* Generate samples: */ srandom(seed); int nerr = 0; int i; for (i = 0; i < NS; i++) { double *attri = notnull(malloc(NA*sizeof(double)), "no mem"); D->smp[i] = attri; P->gen(i, NAR, NC, attri, &(classD[i])); if (verify) { int ver = P->lab(NAR, NC, attri); if (classD[i] != ver) { /* Since sample is unperturbed,this is a fatal error or very bad luck: */ fprintf(stderr, "** inconsistent class %d != %d for sample %d:\n", classD[i], ver, i); rn_classif_sample_print(stderr, " ", NAR, attri, " ", "\n"); nerr++; if (nerr > 100) { fprintf(stderr, "** too many errors\n"); exit(1); } } } } (*DP) = D; (*classDP) = classD; } void generate_raw_dataset_grid(problem_t *P, int NA, int NS, uint32_t seed, dataset_t **DP, int **classP) { int NC = P->NC; int NAR = P->NA; /* Compute the grid order {NG} such that {NG^NAR >= NS}: */ int NG = (int)ceil(pow(NS, 1.0/NAR)); int NT = (int)ipow(NG, NAR); /* Total number of tentative samples. */ assert(NT >= NS); demand(NT <= MAX_SAMPLES, "too many samples in grid"); /* Allocate sample array with maximum size: */ int *classD = notnull(malloc(NT*sizeof(int)), "no mem"); dataset_t *D = rn_classif_dataset_new(NT, NA); /* Each sample is identified by a tuple of {NAR} indices in {0..NG-1}: */ int ix[NAR]; /* Generate grid samples and store those that have nonzero class. */ /* Those samples are stored in postions {0..NOK-1} of {attr} and {classD}. */ NS = 0; srandom(seed); double attr[NA]; /* Temporary attribute vector. */ int g; for (g = 0; g < NT; g++) { /* Break {g} down into {NAR} grid indices: */ get_grid_indices(g, NG, NAR, ix); /* Convert the indices into attributes in {U}: */ int t; for (t = 0; t < NAR; t++) { attr[t] = 2*(ix[t] + 0.5)/NG - 1; } /* Now use the problem's classifier to obtain the classD: */ int cl = P->lab(NAR, NC, attr); if (cl != 0) { /* Sample is inside some domain, store it: */ double *attrn = notnull(malloc(NA*sizeof(double)), "no mem"); for (t = 0; t < NAR; t++) { attrn[t] = attr[t]; } D->smp[NS] = attrn; classD[NS] = cl; NS++; } } /* Trim excess storage and return: */ D->NS = NS; D->smp = notnull(realloc(D->smp, NS*sizeof(double*)), "no mem"); classD = notnull(realloc(classD, NS*sizeof(int)), "no mem"); (*DP) = D; (*classP) = classD; } void get_grid_indices(int g, int NG, int NAR, int ix[]) { int k; int tmp = g; for (k = 0; k < NAR; k++) { ix[k] = tmp % NG; tmp = tmp/NG; } } problem_t *get_problem(char *name, int NAR, int NC) { problem_kind_t kind; for (kind = 0; kind < problem_kind_NUMBER; kind++) { if (strcmp(name, problem_kind_name[kind]) == 0) { /* Found the problem: */ /* Get the problem-specific procedure {def}: */ problem_def_proc_t *chk = NULL; rn_classif_labeler_t *lab = NULL; rn_classif_thrower_t *gen = NULL; switch(kind) { case problem_kind_SATURN: chk = &rn_classif_test_check_saturn; lab = &rn_classif_test_label_saturn; gen = &rn_classif_test_throw_saturn; break; case problem_kind_PETALS: chk = &rn_classif_test_check_petals; lab = &rn_classif_test_label_petals; gen = &rn_classif_test_throw_petals; break; case problem_kind_VESSEL: chk = &rn_classif_test_check_vessel; lab = &rn_classif_test_label_vessel; gen = &rn_classif_test_throw_vessel; break; case problem_kind_MBALLS: chk = &rn_classif_test_check_mballs; lab = &rn_classif_test_label_mballs; gen = &rn_classif_test_throw_mballs; break; case problem_kind_SHELLS: chk = &rn_classif_test_check_shells; lab = &rn_classif_test_label_shells; gen = &rn_classif_test_throw_shells; break; case problem_kind_STAOLI: chk = &rn_classif_test_check_staoli; lab = &rn_classif_test_label_staoli; gen = &rn_classif_test_throw_staoli; break; default: assert(FALSE); } problem_t *P = notnull(malloc(sizeof(problem_t)), "no mem"); (*P) = (problem_t) { .NA = NAR, .NC = NC, .lab = lab, .gen = gen }; chk(&(P->NA), &(P->NC)); return P; } } demand(FALSE, "unrecognized problem kind"); } options_t *parse_options(int argc, char **argv) { /* Initialize argument parser: */ argparser_t *pp = argparser_new(stderr, argc, argv); argparser_set_help(pp, PROG_NAME " version " PROG_VERS ", usage:\n" PROG_HELP); argparser_set_info(pp, PROG_INFO); argparser_process_help_info_options(pp); /* Allocate the command line argument record: */ options_t *o = (options_t *)malloc(sizeof(options_t)); /* Parse keyword parameters: */ argparser_get_keyword(pp, "-problem"); o->problem = argparser_get_next(pp); argparser_get_keyword(pp, "-samples"); o->samples_model = (int)argparser_get_next_int(pp, 1, MAX_SAMPLES); o->samples_refine = (int)argparser_get_next_int(pp, 0, MAX_SAMPLES); o->samples_eval = (int)argparser_get_next_int(pp, 0, MAX_SAMPLES); argparser_get_keyword(pp, "-seed"); o->seed = (uint32_t)argparser_get_next_uint(pp, 1, MAX_SEED); argparser_get_keyword(pp, "-prefix"); o->prefix = argparser_get_next(pp); if (argparser_keyword_present(pp, "-attributes")) { o->attributes_rel = (int)argparser_get_next_int(pp, 0, MAX_ATTRIBS); o->attributes_irr = (int)argparser_get_next_int(pp, 0, MAX_ATTRIBS - o->attributes_rel); } else { o->attributes_rel = 0; o->attributes_irr = 0; } if (argparser_keyword_present(pp, "-classes")) { o->classes = (int)argparser_get_next_int(pp, 0, MAX_CLASSES); } else { o->classes = 0; } if (argparser_keyword_present(pp, "-noise")) { o->noise = argparser_get_next_double(pp, 0, MAX_NOISE); } else { o->noise = 0; } if (argparser_keyword_present(pp, "-verify")) { o->verify = argparser_get_next_bool(pp); } else { o->verify = FALSE; } if (argparser_keyword_present(pp, "-opf")) { o->opf = argparser_get_next_bool(pp); } else { o->opf = FALSE; } if (argparser_keyword_present(pp, "-grid")) { o->grid = argparser_get_next_bool(pp); } else { o->grid = FALSE; } if (argparser_keyword_present(pp, "-refine")) { o->refine_iters = (int)argparser_get_next_int(pp, 0, MAX_REFINE_ITERS); } else { o->refine_iters = 0; } if (argparser_keyword_present(pp, "-swap")) { if (argparser_keyword_present_next(pp, "same")) { o->swap_same = TRUE; } else if (argparser_keyword_present_next(pp, "full")) { o->swap_same = FALSE; } else { argparser_error(pp, "expecting \"same\" or \"full\""); } } else { o->swap_same = FALSE; } if (argparser_keyword_present(pp, "-image")) { o->image_size = (int)argparser_get_next_int(pp, 0, MAX_IMAGE_SIZE); o->image_subsmp = (int)argparser_get_next_int(pp, 0, MAX_SUBSMP); } else { o->image_size = 0; o->image_subsmp = 0; } /* Parse positional arguments: */ argparser_skip_parsed(pp); /* Check for spurious arguments: */ argparser_finish(pp); return o; }