Dyadic Tree implementationed discussed in Section 2¶

import csv

# colors
RED=1.0
BLUE=0.0
UNKNOWN=2.0


class Rect:
    '''
    Class for representing rectangles.
    '''
    def __init__(self, j, k, l, m):
        self._j = j
        self._k = k
        self._l = l
        self._m = m

    @property
    def coords(self):
        '''
        Convert from the four integer representation to the more common
        opposing points representation.
        '''
        return ((self._j/(2**self._k), self._l/(2**self._m)),
                ((self._j+1)/(2**self._k), (self._l+1)/(2**self._m)))

    @property
    def left(self):
        '''
        Return a new rectangle that represents the left half of this
        rectange
        '''
        return Rect(2*self._j, self._k+1, self._l, self._m)

    @property
    def right(self):
        '''
        Return a new rectangle that represents the right half of this
        rectange
        '''
        return Rect(2*self._j+1, self._k+1, self._l, self._m)

    @property
    def bottom(self):
        '''
        Return a new rectangle that represents the bottom half of this
        rectange
        '''
        return Rect(self._j, self._k, 2*self._l, self._m+1)

    @property
    def top(self):
        '''
        Return a new rectangle that represents the top half of this
        rectange
        '''
        return Rect(self._j, self._k, 2*self._l+1, self._m+1)


    @property
    def x_index(self):
        '''
        Compute the row index for the memoization data structure.
        '''
        return 2**(self._k+1) + self._j - 1

    @property
    def y_index(self):
        '''
        Compute the column index for the memoization data structure.
        '''
        return 2**(self._m+1) + self._l - 1


    def __str__(self):
        return "(({}, {}), ({}, {}))".format(self._j, self._k, self._l, self._m)


class Tree_Node:
    '''
    Representation for a node in the decision tree.
    '''
    def __init__(self, R, color, cost):
        self._R = R
        self._color = color
        self._cost = cost
        self._left = None
        self._right = None
        self._split_on = "no_split"

    @property
    def R(self):
        return self._R

    @property
    def color(self):
        return self._color

    @color.setter
    def color(self, c):
        self._color = c

    @property
    def cost(self):
        assert self._cost >= 0
        return self._cost

    @cost.setter
    def cost(self, c):
        self._cost = c

    @property
    def left(self):
        return self._left

    @left.setter
    def left(self, left):
        self._left = left

    @property
    def right(self):
        return self._right

    @right.setter
    def right(self, right):
        self._right = right

    @property
    def split_on(self):
        return self._split_on

    @split_on.setter
    def split_on(self, s):
        self._split_on = s

    def print_partition(self):
        '''
        Print the coordinates of the rectangles that correspond to the
        partitioning represented by this tree.
        '''
        if self._split_on == "no_split":
            ((x0, y0), (x1, y1)) = self._R.coords
            print("(({}, {}), ({}, {})): {}".format(x0, y0, x1, y1, self._color))
            return

        self._left.print_partition()
        self._right.print_partition()


    def classify(self, x, y):
        '''
        Classify a point using this decision tree.
        '''
        if self._split_on == "no_split":
            return self._color

        ((x0, y0), (x1, y1)) = self._R.coords
        if self.split_on == "x_split":
            mid = (x0 + x1)/2
            if x <= mid:
                return self._left.classify(x, y)
            else:
                return self._right.classify(x, y)
        else:
            mid = (y0 + y1)/2
            if y <= mid:
                return self._left.classify(x, y)
            else:
                return self._right.classify(x, y)


class Dyadic:
    '''
    Wrapper class for computing and representing a dyadic partition.
    '''
    def __init__(self, data, lmbda, max_level):
        self._memo = []
        for i in range(2**(max_level+2)):
            self._memo.append([None]*(2**(max_level+2)))
        self._root = self._compute_cost(data, Rect(0, 0, 0, 0), lmbda, max_level)


    def _compute_error(self, data, R):
        '''
        Compute the major color and the number of points from data
        that are in R and have the minority color.

        Returns: (color, error)
        '''
        ((x0, y0), (x1, y1)) = R.coords
        red_cnt = 0
        blue_cnt = 0

        for (x, y, c) in data:
            if (x0 <= x < x1) and (y0 <= y < y1):
                if c == RED:
                    red_cnt = red_cnt + 1
                else:
                    blue_cnt = blue_cnt + 1

        # return the color and the number of points that are
        # mis-classified
        if red_cnt > blue_cnt:
            return (RED, blue_cnt)
        else:
            return (BLUE, red_cnt)


    def _compute_cost(self, data, R, lmbda, level):
        '''
        Compute the optimal cost for this data and R with a leaf
        weighting factor of lmbda and the specified level

        Returns: a tree node.
        '''

        # check the memo table first
        xi = R.x_index
        yi = R.y_index
        memo_R = self._memo[xi][yi]

        if memo_R:
            return memo_R

        # compute the cost of making R a leaf
        (color, error) = self._compute_error(data, R)
        tree_node = Tree_Node(R, color, error + lmbda)

        if (level == 0) or (error == 0):
            self._memo[xi][yi] = tree_node
            return tree_node

        # compute the cost of spliting left/right
        l = self._compute_cost(data, R.left, lmbda, level-1)
        r = self._compute_cost(data, R.right, lmbda, level-1)

        # update tree if Left/Right split is better
        cost = l.cost + r.cost
        if cost < tree_node.cost:
            tree_node.cost = cost
            tree_node.split_on = "split_x"
            tree_node.left = l
            tree_node.right = r

        # compute the cost of spliting top/bottom
        t = self._compute_cost(data, R.top, lmbda, level-1)
        b = self._compute_cost(data, R.bottom, lmbda, level-1)

        # update the tree if the Top/Bottom split is better
        cost = t.cost + b.cost
        if cost < tree_node.cost:
            tree_node.cost = cost
            tree_node.split_on = "split_y"
            tree_node.left = b
            tree_node.right = t

        # update the memo table
        self._memo[xi][yi] = tree_node

        return tree_node

    def classify(self, x, y):
        '''
        Classify point x, y.
        '''
        return root.classify(x, y)

    def print_partition(self):
        '''
        Print the partitioning.
        '''
        self._root.print_partition()


def read_data(filename):
    '''
    Read the data from a csv file.
    '''
    data = []
    with open(filename) as f:
        data = [row for row in csv.reader(f)]
        data = [[float(x) for x in row] for row in data[1:]]
    return data

# quick test.
data = read_data("training.csv")
unit = Rect(0,0,0,0)
dt = Dyadic(data, 14, 10)