{ "metadata": { "name": "" }, "nbformat": 3, "nbformat_minor": 0, "worksheets": [ { "cells": [ { "cell_type": "heading", "level": 1, "metadata": {}, "source": [ "Exercise 2" ] }, { "cell_type": "code", "collapsed": false, "input": [ "\n", "import numpy as np\n", "from sklearn.decomposition import PCA\n", "from sklearn.neighbors import KNeighborsClassifier" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 21 }, { "cell_type": "markdown", "metadata": {}, "source": [ "A function to read the data from a zip file (in url) \n", " and return the X array - data, and the y vector of classes" ] }, { "cell_type": "code", "collapsed": false, "input": [ "def le_dados(x):\n", " import zipfile\n", " import urllib2\n", " import cStringIO\n", " \n", " def digito(fn):\n", " return fn[fn.find('/')+1]\n", " \n", " a=urllib2.urlopen(x)\n", " # le o zip file in memory!!\n", " mem=cStringIO.StringIO(a.read())\n", " zf=zipfile.ZipFile(mem)\n", " files=zf.namelist()[1:]\n", " n=len(files)\n", " aux=zf.read(files[0]).split()[4:]\n", " x=np.empty((n,len(aux)),dtype=np.int)\n", " y=np.empty(n,dtype=np.int)\n", " \n", " for i in range(len(files)):\n", " f=files[i]\n", " y[i] = int(digito(f))\n", " x[i]=map(int,zf.read(f).split()[4:])\n", " \n", " return (x,y)\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 22 }, { "cell_type": "code", "collapsed": false, "input": [ "X,Y=le_dados(\"http://www.ic.unicamp.br/~wainer/cursos/1s2014/train17.zip\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 23 }, { "cell_type": "code", "collapsed": false, "input": [ "Y" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 24, "text": [ "array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,\n", " 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,\n", " 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7])" ] } ], "prompt_number": 24 }, { "cell_type": "code", "collapsed": false, "input": [ "X[0:10,10:30]" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 25, "text": [ "array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])" ] } ], "prompt_number": 25 }, { "cell_type": "markdown", "metadata": {}, "source": [ "I dont know if this is ok or not - there are only 0 there. Let us see one of the entries. First let turn one the online display." 
] }, { "cell_type": "code", "collapsed": false, "input": [ "%matplotlib inline " ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 26 }, { "cell_type": "markdown", "metadata": {}, "source": [ "pyplot can show a 2D array as an image. But we must conver a line of X into a 2D array." ] }, { "cell_type": "code", "collapsed": false, "input": [ "import matplotlib.pyplot as plt" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 27 }, { "cell_type": "code", "collapsed": false, "input": [ "plt.imshow(X[30,:].reshape((64,64)), cmap=plt.cm.gray)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 28, "text": [ "" ] }, { "metadata": {}, "output_type": "display_data", "png": "iVBORw0KGgoAAAANSUhEUgAAAP0AAAD+CAYAAADxoQNSAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAAGbxJREFUeJzt3V9sU+f9P/C3qe2A49QmJDG2w3DBDsSEOEnToVSTWGsc\npnWkQVRZqwlFlFbTpklrb7b2atIuwKwXFdW4mthkdVI7rrqooqjQhq6FtIw/cUQTKQmEEGzHJAR7\nOIHYDud3wc/nS6AhTmL7xH7eL+mIxsexP3XyzvnznOdzVJIkSSAiYaxQugAiyi2GnkgwDD2RYBh6\nIsEw9ESCYeiJBLOk0J84cQKbN2+Gw+HAoUOHMlUTEWWRarHj9DMzM9i0aRNOnToFq9WK5557Dh99\n9BGqq6szXSMRZdCit/Tnzp2D3W6HzWaDRqPBq6++in//+9+ZrI2IskC92G8MBAJYt26d/HVlZSW+\n++67Wc9RqVSLr4yIlmSunfhFb+kZaKL8tOjQW61WjIyMyF+PjIygsrIyI0URURZJi5RIJKQNGzZI\nQ0ND0vT0tORyuaTe3t5ZzwHAhQsXhZa5LPqYXq1W469//St27tyJmZkZ7N+/n2fuifLAoofs0npx\nHvcTKSbjJ/KIKD8x9ESCYeiJBMPQEwmGoScSDENPJBiGnkgwDD2RYBh6IsEw9ESCYeiJBMPQEwmG\noScSDENPJBiGnkgwDD2RYBh6IsEw9ESCYeiJBMPQEwmGoScSDENPJBiGnkgwDD2RYBh6IsEw9ESC\nmTf0r7/+OkwmE7Zu3So/NjExAY/Hg6qqKjQ3NyMSiWS1SCLKnHlDv2/fPpw4cWLWY16vFx6PB/39\n/XC73fB6vVkrkIgyLJ3bUg8NDUk1NTXy15s2bZJGR0clSZKkUCgkbdq06Qe/D8vgdr1cuIi6zGVR\nx/ThcBgmkwkAYDKZEA6HF/MyRKSAJZ/IU6lUvCU1UR5ZVOhNJhNGR0cBAKFQCBUVFRktioiyZ1Gh\nb2lpgc/nAwD4fD60trZmtCgiyh7V/z/hNqfXXnsNX331FcbHx2EymfDnP/8ZL7/8Mtra2nD9+nXY\nbDYcO3YMRqPx8Rfnbn/BUalU0Gg08pLuz1iSJCQSCXmZ59eOMmCuz3je0C8FQ194ioqKYDabYbFY\nYLFYoNVq0/q+6elpBINBhEIhBINBxOPxLFdKc0VbneM6KM9pNB
pYrVa4XC64XC7odLq0vi8Wi6Gn\npwcAMDY2xtAriKGnBdFqtbBYLHC5XHC73T94WPdDbt++DQC4efMm+vr6slkizYOhpwVRqVQoKiqC\nXq9HaWkpVq9endb3qdVq2Gw2OJ1ORCIRhEIhRKNRRCIRTE5OZrlqehhDTzmh0WjkPQSdTofBwUF5\nYehzi6GnnEgdFuh0Oqxfvx7l5eUAHhzfBwIBhasTC0NP81Kr1dBoNFCr1TAajdDpdNBqtQsanVGr\n1SgrK0NZWRkAIJlMYmRkBHq9Pltl0xwYepqX0WiUh+nWr18Pl8sFi8UCjUajdGm0CAw9zctoNMLh\ncMDlcqG6uloeo2fo8xNDT/NKhb6pqQnPPvsstFqtfEUe5R+GnualVquh0+lgMBhQWlq6qNdIJpPy\nEF00GkVvby8CgQBisViGq6X5MPSUlqVerZ1IJBAMBjEwMIDBwUH532g0mqEKKV0MPeVEPB5HIBCA\n3+9HV1cXQqGQvNWn3GLoCcDsYTm1evavRUlJCVatWvXY4wuRTCYxPj6OwcFBnD9/Xr4sl3KPoScA\ns4fl1qxZM2ud3W6H3W6HwWBQqDrKJIaeAMweltuwYcOsdWVlZbBarWlPrqHljaEnALOH5RoaGmat\nU6vV8jAd5T+GngDMHpZ7dPeeCgtva0UytrASA0NPJBiGnkgwDD3J2MhUDAw9yXhMLwaGnkgwDD3J\nuHsvBoaeZNy9FwNDTySYeUM/MjKCF154AVu2bEFNTQ0++OADAMDExAQ8Hg+qqqrQ3NyMSCSS9WKJ\naOnmDb1Go8H777+P77//Ht9++y2OHDmCvr4+eL1eeDwe9Pf3w+12w+v15qJeyiIe04th3tCvXbsW\ndXV1AAC9Xo/q6moEAgF0dHSgvb0dANDe3o5PPvkku5VS1vGYXgwLOqa/du0aLl26hG3btiEcDsNk\nMgEATCYTwuFwVgokosxKO/SxWAx79uzB4cOHUVJSMmudSqXirmEB4M9QDGmFPpFIYM+ePdi7dy9a\nW1sBPNi6j46OAgBCoRAqKiqyVyXlBHfvxTBv6CVJwv79++F0OvHWW2/Jj7e0tMDn8wEAfD6f/MeA\niJa3eZtonDlzBv/85z9RW1uL+vp6AMDBgwfxzjvvoK2tDUePHoXNZsOxY8eyXizll2QyiUQigWQy\niUgkgqmpKcTjce5RKGze0P/kJz/B/fv3f3DdqVOnMl4QKSfTx/Sp+9AHg0EMDw/D7/cjGAwikUhk\n9H1oYdgui2SZ3gJHIhEMDAzA7/ejr68PwWCQoV8GGHrKmlTou7q6cOHCBcTjcSQSCYZeYQy9oNRq\nNQwGA4xGIwwGA5xOJ6xWa0bvF59MJnH37l1Eo1FMTExk7HVpaRh6QWk0GlgsFjgcDtjtdvlf3tCi\n8DH0gtJqtbBarXC5XGhqaoLZbJa3+lTYGHpBqdVqlJWVwW63o7GxEatXr1a6JMoRzqcnEgxDTyQY\nhp5IMDymp4xJJpOIRqOIRCKIRqPo7e1FIBBALBZTujR6CENPGZNIJBAMBjEwMIDBwUH532g0qnRp\n9BCGnjImHo8jEAjA7/ejq6sLoVBI3urT8sHQU8Ykk0mMj49jcHAQ58+fx+3bt5UuiX4AT+QRCYah\nJxIMQ08kGB7T05LEYjFEo1FEo1EEAgFcvXoVt27dQjKZVLo0mgNDT0sSiUQwODgoLwMDAwgGgwz9\nMsbQ05KkQt/V1YWenh5EIhFEIhE2yljGGHpaklgshhs3buDy5cs4f/680uVQGngij0gwDD2RYBh6\nIsEw9ESCYeiJBMPQEwnmiaG/d+8etm3bhrq6OjidTrz77rsAgImJCXg8HlRVVaG5uRmRSCQnxdLS\nqNVqrFq1CiUlJTAajdDpdNBqtbxFtWCeGPqVK1eis7MT3d3d6OnpQWdnJ7755ht4vV54PB709/fD\n7XbD6/Xmql5aAqPRCLvdju
effx5utxsulwsWiwUajUbp0iiH5t291+l0AB40SJiZmcHq1avR0dGB\n9vZ2AEB7ezs++eST7FZJGWE0GuFwOPD8889jx44dDL2g5r0i7/79+2hoaMCVK1fwm9/8Blu2bEE4\nHIbJZAIAmEwmhMPhrBdKS5cKfVNTE5599llotVpoNBqGXjDzhn7FihXo7u5GNBrFzp070dnZOWu9\nSqXiMWGeUKvV0Ol0MBgMKC0tVbocUkjaZ+8NBgNeeuklXLhwASaTCaOjowCAUCiEioqKrBVImZXp\n21FT/nli6MfHx+Uz83fv3sXJkydRX1+PlpYW+Hw+AIDP50Nra2v2KyWijHji7n0oFEJ7ezvu37+P\n+/fvY+/evXC73aivr0dbWxuOHj0Km82GY8eO5apeWgCVSiUfs2s0GpSUlGDVqlVQqxc/uVKSJPke\n84lEAnfu3MHdu3c5fz6PqKQs7u/xWF9ZRUVFMJvNsFgssFgscDqdcLlccLlc2Lhx46Jec3p6GqFQ\nCMFgEMFgEL29vfD7/fD7/bhy5UqG/w9oKeaKNufTFzCNRiPfjtrlcuFHP/oRrFYrjEbjol8zkUjI\nve39fj+uX7+OQCDAC7TyCENfwLRaLSwWC1wuF9xuN8rKyuRhusWKx+MIBoPw+/344osvMD4+jng8\nzk45eYShL2AqlQpFRUXQ6/UoLS2FwWBY8mtKkoTp6WnEYjFMTEzw7jV5iBNuiATD0BMJhqEnEgxD\nTyQYhp5IMAw9kWAYeiLBMPREgmHoiQTD0BMJhqEnEgxDTyQYhp5IMJxlV2D0ej0MBgMMBgOsVis2\nbNiANWvWLKlbTiwWQzQaRTQaRSAQwNWrV3Hr1i12y8lTDH2BSd3QIrU4HA5YLJYlhT4SiWBwcFBe\nBgYGEAwGGfo8xdAXmFTom5qaUFtbC6PRCKPRuKTGGanQd3V1oaenB5FIBJFIhI0z8hRDn+cebX5p\nNpvhcDhQV1eHhoaGjLxHLBbDjRs3cPnyZZw/fz4jr0nKYejznFarfaz5pd1uz0iXHCpMDH2ey0bz\nSypsDH2ey0bzSypsDH2ey0bzSypsvDiHSDAMPZFg0gr9zMwM6uvrsWvXLgDAxMQEPB4Pqqqq0Nzc\nzLubEOWRtEJ/+PBhOJ1O+d50Xq8XHo8H/f39cLvd8Hq9WS2SiDJn3tDfuHEDx48fxxtvvCHfEK+j\nowPt7e0AgPb2dnzyySfZrZKIMmbe0L/99tt47733sGLF/z01HA7DZDIBAEwmE8LhcPYqJKKMemLo\nP/30U1RUVKC+vn7O296qVCrekjrH9Ho9rFYrnE4nGhoaMjKTjsTxxN+Ss2fPoqOjA8ePH8e9e/fw\nv//9D3v37oXJZMLo6CjWrl2LUCiEioqKXNVLyM5MOhLHE7f0Bw4cwMjICIaGhvDxxx/jxRdfxIcf\nfoiWlhb4fD4AgM/nQ2tra06KpQcenknndrtRW1sLs9nMq/AoLQvaNKR249955x20tbXh6NGjsNls\nOHbsWFaKox+m1+tRWVmJmpoaNDY2zvk8SZKQSCTk5amnnoJGo4FareZegcDS/slv374d27dvBwCU\nlpbi1KlTWSuK5pfOeZR4PI5QKIRgMIhgMIinn34aFosFZrMZa9asyUGVtBzxz32emuvE6sMSiQQC\ngQD8fj/8fj/MZjNcLhd0Oh1DLzCGvoDF43EEg0H4/X588cUXsNvtKC4uxvr165UujRTE0OehWCyG\nQCCA3t7eJz4vGo2ir68Pw8PDuHnzJsrLy3H37l32thMcQ5+HUj3rAGB4eBjAg2P8h3f5VSoVpqam\nHmtimc5hARU2hj4PRSIRDAwMYGxsDHq9fs7nJZNJNrGkxzD0eSgWi8nNKokWivPpBcNLpomhFwyP\n6YmhJxIMj+kLmFqthsFggNFohMFggNPphNVqfeLJPyp8DH0B02g0sFgscDgc8mw83giDGPoC
ptVq\n5RthNDU1wWw2y1t9EhdDX8DUajXKyspgt9vR2NiI1atXK10SLQM8kUckGIaeSDAMPZFgGHoiwTD0\nRIJh6IkEI/SQnUqlgkajkZdcT0ZJJpNIJBJIJpMLamyhVqvTanBpNBqh0+mg1WoX9P/2aEPNO3fu\nsPlGARE69FqtFmazGRaLBRaLBVqtNqfvPz4+jmAwiFAohFu3bqX9fUajUa77Sb3u9Ho9XC4XLBbL\ngtpjP9pQs7e3F4ODg4hGo2m/Bi1fQodeo9HIV6ylGkbm0pUrV+D3+zE1NbXg0DscDrhcLmzYsGHO\n5xUVFcl/0BYS+kcbal6/fh2BQIB3Jy4QQodeq9XCYrHA5XLB7XbDaDTm9P3Pnz+PyclJueVVulKh\nb2pqQkNDw5zPU6lU0Gq18uFLuh5tqDk+Po54PM7uOwVC6NCrVCoUFRVBr9ejtLQ055epWq1WOBwO\njI2NLejmEzU1NbDb7bBarVlpZS1JEqanpxGLxTAxMcHd+gIjdOiVlro9FYAFtaWurKzkbDlaNIZe\nQand9PLycsRisbS/T6/Xc7YcLVpaobfZbHj66afle6GdO3cOExMT+OUvf4nh4WH5fna5PibOd3q9\nXr4v3XKSOhdQXFz82M/00eE8tt/KP2mFXqVS4fTp0ygtLZUf83q98Hg8+MMf/oBDhw7B6/XC6/Vm\nrVDKnYfn4QPA5OSkvG56eloeZgwGg4jH40qVSYuU9u79o3/ROzo68NVXXwEA2tvb8dOf/pShLxCp\njjsAUF5ePivYsVgMPT09AICxsTGGPg+lvaXfsWMHnnrqKfz617/Gm2++iXA4DJPJBAAwmUwIh8NZ\nLZRyJzWUWV5ejurq6ll/8G/fvg0AuHnzJvr6+pQqkZYgrdCfOXMGZrMZY2Nj8Hg82Lx586z1KpWK\n/dQLSGoos6io6LF1arUaNpsNTqcTkUgEd+7ckdel7qgTjUYRjUZ52e4ylVbozWYzgAe7ert378a5\nc+dgMpkwOjqKtWvXIhQKoaKiIquF0vKQ2vVPXcF47949ed3U1BQGBwcxMDCAyclJhn6Zmjf0U1NT\nmJmZQUlJCSYnJ/H555/jT3/6E1paWuDz+fDHP/4RPp8Pra2tuaiXFJba9dfpdFi/fv2sYEejURQX\nF8tXGT78B4GWj3lDHw6HsXv3bgAPdt9+9atfobm5GY2NjWhra8PRo0flIbt8I0kS4vE4JicnH7uu\nXOkZeMtVqtlmWVnZY+tSV+8FAgEMDAwAwKJmEVJ2qaQsDrQu96CkZqHV1dXB5XKhuLhYXpearJKa\nzZbrGXj5KBaLwe/3o7u7G36/H8PDw4uaRUiZMVe0hb4iL5FIIBgMAngw/PRwsPV6PWprawE8OJfB\n0M/v0aG+vr6+Rc0ipOwSOvSp2WRjY2Po6+ubtWeSmnxTUVGB6upqpUrMK48O9RkMhkXNIqTsEjr0\nqdlk09PTj61LJpO4du0aent7YTQaUVJSktVa1Gq1fD29wWBY0Ky75WJmZgaxWEwetgsEArh9+zZP\n6C0z+feblSOpXf/U7unKlSsBPDhPkTpWevi/l7pu1apV8v3miouL8zL0qc9sYGBAHrpjx53lJ/9+\ns3Iktes/NTWF4eHhrIcwtStcXFyM9evXy39k8kk8Hpc77nR1dSEUCslbfVo+GPo5JJNJjI+PY3x8\nPCfvV1paCoPBIDfWyMchromJCVy/fh29vb3473//y/ZayxRDv0w8vJUEMGv4MF+khuyCwSBbay1j\nDP0y8aThw3yRmnbL0C9vQl+cs5w83MRyoX3ql4vUFY6JRALxeJwNNhQ21+fP0BMVqLmizdtaEQmG\noScSDENPJBiGnkgwDD2RYBh6IsEw9ESCYeiJBMPQEwmGoScSDENPJBiGnkgwDD2RYBh6IsEw9ESC\nSSv0kUgEr7zyCqqrq+F0OvHdd99hYmICHo8HVVVVaG5u
Zj80ojyRVuh///vf4+c//zn6+vrQ09OD\nzZs3w+v1wuPxoL+/H263G16vN9u1ElEmSPOIRCLSM88889jjmzZtkkZHRyVJkqRQKCRt2rTpsecA\n4MKFi0LLXObd0g8NDaG8vBz79u1DQ0MD3nzzTUxOTiIcDsNkMgEATCYTwuHwfC9FRMvAvKFPJpO4\nePEifvvb3+LixYsoLi5+bFdepVKxHx5Rnpg39JWVlaisrMRzzz0HAHjllVdw8eJFrF27FqOjowCA\nUCiEioqK7FZKRBkxb+jXrl2LdevWob+/HwBw6tQpbNmyBbt27YLP5wMA+Hw+tLa2ZrdSIsqItFpg\n+/1+vPHGG4jH49i4cSP+8Y9/YGZmBm1tbbh+/TpsNhuOHTsGo9E4+8W5y0+kGPa9JxIM+94TEQCG\nnkg4DD2RYHjXWqICoNfrYTAYYDAYoNfrce7cuTmfy9ATFQCj0Qi73Q673Q6r1frE0HP3nqgApELf\n1NSEn/3sZ098Lrf0RAVAr9ejsrISNTU1+PGPf/zE53JLT1Qg0r4uZr6ptUuxfft2xacXcuEi4rJ9\n+/Y5c5nVK/KIaPnh7j2RYBh6IsEw9ESCyWroT5w4gc2bN8PhcODQoUPZfKvHvP766zCZTNi6dav8\nmFIdfEdGRvDCCy9gy5YtqKmpwQcffKBIPffu3cO2bdtQV1cHp9OJd999V5E6HjYzM4P6+nrs2rVL\n0VpsNhtqa2tRX18vD3kpVUu2u09nLfQzMzP43e9+hxMnTqC3txcfffQR+vr6svV2j9m3bx9OnDgx\n6zGlOvhqNBq8//77+P777/Htt9/iyJEj6Ovry3k9K1euRGdnJ7q7u9HT04POzk588803inY2Pnz4\nMJxOpzzcpFQtKpUKp0+fxqVLl+Sr2ZSqJevdp7M1XHf27Flp586d8tcHDx6UDh48mK23+0FDQ0NS\nTU2N/HU6HXxz4eWXX5ZOnjypaD2Tk5NSY2OjdPnyZcXqGBkZkdxut/Tll19Kv/jFLyRJUu5nZLPZ\npPHx8VmPKVHLUrpPpytrW/pAIIB169bJX1dWViIQCGTr7dKyHDr4Xrt2DZcuXcK2bdsUqef+/fuo\nq6uDyWSSDzmU+lzefvttvPfee1ix4v9+DZWqRaVSYceOHWhsbMTf/vY3xWrJRffprIV+uXfNUaKD\nbywWw549e3D48GGUlJQoUs+KFSvQ3d2NGzdu4D//+Q86OzsVqePTTz9FRUUF6uvr527rlMOf0Zkz\nZ3Dp0iV89tlnOHLkCL7++mtFaslF9+mshd5qtWJkZET+emRkBJWVldl6u7SYTCbFOvgmEgns2bMH\ne/fulZuIKlmPwWDASy+9hAsXLihSx9mzZ9HR0YFnnnkGr732Gr788kvs3btXsc/EbDYDAMrLy7F7\n926cO3dOkVpy0X06a6FvbGzEwMAArl27hng8jn/9619oaWnJ1tulpaWlRZEOvpIkYf/+/XA6nXjr\nrbcUq2d8fFw+63v37l2cPHkS9fX1inwuBw4cwMjICIaGhvDxxx/jxRdfxIcffqhILVNTU7hz5w4A\nYHJyEp9//jm2bt2qSC056T696LMBaTh+/LhUVVUlbdy4UTpw4EA23+oxr776qmQ2myWNRiNVVlZK\nf//736Vbt25JbrdbcjgcksfjkW7fvp2TWr7++mtJpVJJLpdLqqurk+rq6qTPPvss5/X09PRI9fX1\nksvlkrZu3Sr95S9/kSRJUuxzSTl9+rS0a9cuxWq5evWq5HK5JJfLJW3ZskX+XVXqc+nu7pYaGxul\n2tpaaffu3VIkEsloLbz2nkgwvCKPSDAMPZFgGHoiwTD0RIJh6IkEw9ATCYahJxLM/wNL5CX03bnZ\nUgAAAABJRU5ErkJggg==\n", "text": [ "" ] } ], "prompt_number": 28 }, { "cell_type": 
"markdown", "metadata": {}, "source": [ "It looks OK. An ugly \"1\" but a \"1\"" ] }, { "cell_type": "code", "collapsed": false, "input": [ "knn = KNeighborsClassifier(n_neighbors=1)\n", "knn.fit(X,Y)" ], "language": "python", "metadata": {}, "outputs": [ { "metadata": {}, "output_type": "pyout", "prompt_number": 29, "text": [ "KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',\n", " n_neighbors=1, p=2, weights='uniform')" ] } ], "prompt_number": 29 }, { "cell_type": "code", "collapsed": false, "input": [ "Xtest,Ytest=le_dados(\"http://www.ic.unicamp.br/~wainer/cursos/1s2014/test17.zip\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 30 }, { "cell_type": "code", "collapsed": false, "input": [ "accur=knn.score(Xtest,Ytest)" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 31 }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] }, { "cell_type": "code", "collapsed": false, "input": [ "print(accur)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.8125\n" ] } ], "prompt_number": 32 }, { "cell_type": "markdown", "metadata": {}, "source": [ "The test went well. The accuracy was 0.8125. 
I can now write the loop for all k" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "Loop for all values of k" ] }, { "cell_type": "code", "collapsed": false, "input": [ "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X,Y)\n", " print \"k=\",k,1-knn.score(Xtest,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.1875\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 3 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.2875\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 5 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.3375\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 11 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.3875\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 17 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.425\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 21 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.4125\n" ] } ], "prompt_number": 34 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let us try the PCA now." 
] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "PCA of 100" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pca100=PCA(n_components=100)\n", "X100=pca100.fit_transform(X)\n", "Xtest100=pca100.transform(Xtest)\n", "\n" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 36 }, { "cell_type": "code", "collapsed": false, "input": [ "knn = KNeighborsClassifier(n_neighbors=1)\n", "knn.fit(X100,Y)\n", "print 1-knn.score(Xtest100,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "0.1875\n" ] } ], "prompt_number": 37 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Odd, the same number! Let us try the loop" ] }, { "cell_type": "code", "collapsed": false, "input": [ "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X100,Y)\n", " print \"k=\",k,1-knn.score(Xtest100,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 0.1875\n", "k= 3 0.275\n", "k= 5 0.25\n", "k= 11 0.3125\n", "k= 17 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.3\n", "k= 21 0.3\n" ] } ], "prompt_number": 38 }, { "cell_type": "markdown", "metadata": {}, "source": [ "So the first is the same but not the others. It seems ok. 
The other PCA" ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "PCA of 40 " ] }, { "cell_type": "code", "collapsed": false, "input": [ "pca40 = PCA(n_components=40)\n", "X40=pca40.fit_transform(X)\n", "Xtest40=pca40.transform(Xtest)\n", "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X40,Y)\n", " print \"k=\",k,1-knn.score(Xtest40,Ytest)\n" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 0.2125\n", "k= 3 0.1875\n", "k= 5 0.2\n", "k= 11 0.25\n", "k= 17 0.2375\n", "k= 21 0.2625\n" ] } ], "prompt_number": 39 }, { "cell_type": "markdown", "metadata": {}, "source": [ "Well, at least it is different from the PCA of 100." ] }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Summary for the 1/7 data set" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The best alternatives were:\n", "\n", " * no PCA and k=1, error rate of 0.1875\n", " * PCA of 100 and k=1, same error rate\n", " * PCA of 40 and k=3, same error rate\n", "\n", "I am somewhat puzzled that the error rate was the same in all three cases." ] }, { "cell_type": "heading", "level": 2, "metadata": {}, "source": [ "The 4/9 dataset" ] }, { "cell_type": "code", "collapsed": false, "input": [ "X,Y=le_dados(\"http://www.ic.unicamp.br/~wainer/cursos/1s2014/train49.zip\")\n", "Xtest,Ytest=le_dados(\"http://www.ic.unicamp.br/~wainer/cursos/1s2014/test49.zip\")" ], "language": "python", "metadata": {}, "outputs": [], "prompt_number": 44 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "No PCA" ] }, { "cell_type": "code", "collapsed": false, "input": [ "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X,Y)\n", " print \"k=\",k,1-knn.score(Xtest,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 " ] }, { "output_type": "stream", "stream": "stdout", 
"text": [ "0.16\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 3 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.22\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 5 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.34\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 11 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.32\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 17 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.38\n", "k=" ] }, { "output_type": "stream", "stream": "stdout", "text": [ " 21 " ] }, { "output_type": "stream", "stream": "stdout", "text": [ "0.34\n" ] } ], "prompt_number": 47 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "PCA 100" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pca100 = PCA(n_components=100)\n", "X100=pca100.fit_transform(X)\n", "Xtest100=pca100.transform(Xtest)\n", "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X100,Y)\n", " print \"k=\",k,1-knn.score(Xtest100,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 0.22\n", "k= 3 0.28\n", "k= 5 0.34\n", "k= 11 0.24\n", "k= 17 0.28\n", "k= 21 0.36\n" ] } ], "prompt_number": 48 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "PCA 40" ] }, { "cell_type": "code", "collapsed": false, "input": [ "pca40 = PCA(n_components=40)\n", "X40=pca40.fit_transform(X)\n", "Xtest40=pca40.transform(Xtest)\n", "for k in (1,3,5,11,17,21):\n", " knn = KNeighborsClassifier(n_neighbors=k)\n", " knn.fit(X40,Y)\n", " print \"k=\",k,1-knn.score(Xtest40,Ytest)" ], "language": "python", "metadata": {}, "outputs": [ { "output_type": "stream", "stream": "stdout", "text": [ "k= 1 0.12\n", "k= 3 0.16\n", "k= 5 0.18\n", "k= 11 0.26\n", "k= 17 0.24\n", "k= 21 0.32\n" ] } ], 
"prompt_number": 49 }, { "cell_type": "heading", "level": 3, "metadata": {}, "source": [ "Summary for the 4/9 dataset" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "PCA of 40, k=1, error rate 0.12" ] }, { "cell_type": "code", "collapsed": false, "input": [], "language": "python", "metadata": {}, "outputs": [] } ], "metadata": {} } ] }