import sklearn.datasets
import numpy as np
from matplotlib import pyplot as plt


class WeakClassifier:
    def __init__(self, nFeatures):
        # The alpha value for later, if this classifier is picked into the cascade
        self.alpha = 1.0

        # The weak classifier picks two random dimensions out of the feature vector.
        # It classifies a sample as positive if featureA is greater than or equal to featureB.
        self.featureA = np.random.randint(0, nFeatures)
        self.featureB = np.random.randint(0, nFeatures)

    def predict(self, samples):
        # Compare the two selected features and map the boolean result to the labels +1/-1
        values = samples[:, self.featureA] - samples[:, self.featureB]
        return 2 * (values >= 0) - 1
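

# Note: each WeakClassifier is essentially a randomly drawn "decision stump"
# that compares two pixel intensities. If featureA happens to equal featureB,
# the difference is always zero and such a stump predicts +1 for every sample.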


def load_data():
    # Load the digits dataset: 8x8 pixel images of the handwritten digits
    # 0 to 9, with pixel values between 0 and 15
    digits = sklearn.datasets.load_digits()

    # Select two digits for binary classification. Flatten the images,
    # as we don't need the 2D structure anyway
    positive_class = digits.images[digits.target == 2].reshape(-1, 64)
    negative_class = digits.images[digits.target == 8].reshape(-1, 64)
    positive_label = np.ones(positive_class.shape[0])
    negative_label = -np.ones(negative_class.shape[0])

    # Concatenate both into the same set
    data = np.concatenate([positive_class, negative_class])
    labels = np.concatenate([positive_label, negative_label])

    # Start with equal weights for each sample
    weights = np.ones_like(labels)

    return data, labels, weights
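

# For reference: in scikit-learn's digits dataset the two selected classes
# contain roughly 175 samples each, so data has shape (about 350, 64) and
# labels is a matching vector of +1/-1 entries.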


def generate_weak_classifiers():
    # Generate a random selection of weak classifiers. The AdaBoost algorithm
    # will pick one of these for the next cascade stage
    return [WeakClassifier(64) for _ in range(8)]


def pick_weak_classifier(data, labels, weights, classifiers):
    # Find the one classifier among the given candidates that minimizes
    # the sum of weights over wrongly classified samples
    minimalSum = None
    bestClassifier = None

    # Iterate over all options
    for classifier in classifiers:
        # Make a prediction for each sample
        predictions = classifier.predict(data)

        # Wrong samples are those whose prediction differs from the label
        wrong = predictions != labels

        # Sum the current weights of the wrongly predicted samples
        sumW = weights[wrong].sum()

        # If this sum is lower, keep this classifier as the current best
        if bestClassifier is None or sumW < minimalSum:
            bestClassifier = classifier
            minimalSum = sumW

    # Return the best classifier
    return bestClassifier
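

# For reference, the selection criterion above is the (unnormalized) weighted
# error of a candidate classifier h over samples (x_i, y_i) with weights w_i:
#
#     err(h) = sum of w_i over all i with h(x_i) != y_i
#
# The normalization by the total weight happens later, in build_one_stage.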


def build_one_stage(data, labels, weights, classifiers, cascade):
    # Pick the best weak classifier given the current weights
    classifier = pick_weak_classifier(data, labels, weights, classifiers)

    # Calculate predictions
    predictions = classifier.predict(data)
    wrong = predictions != labels

    # Calculate the normalized weighted error
    e = weights[wrong].sum() / weights.sum()

    # Guard against a perfect (or perfectly wrong) pick, which would make
    # the logarithm below blow up
    e = np.clip(e, 1e-10, 1 - 1e-10)

    # Calculate the alpha value
    alpha = 0.5 * np.log((1 - e) / e)

    # Update the weight of each sample: wrongly classified samples gain
    # weight, correctly classified ones lose weight
    weights = weights * np.exp(-alpha * predictions * labels)

    # Remember alpha and add the classifier to the cascade
    classifier.alpha = alpha
    cascade.append(classifier)

    return weights, cascade
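

# The stage above is the textbook discrete AdaBoost update: with normalized
# weighted error e of the chosen weak classifier h_t, the code computes
#
#     alpha_t = 0.5 * ln((1 - e) / e)
#     w_i    <- w_i * exp(-alpha_t * h_t(x_i) * y_i)
#
# As long as e < 0.5 (alpha_t > 0), misclassified samples (h_t(x_i) * y_i = -1)
# are scaled up by exp(alpha_t) and correctly classified ones are scaled down
# by exp(-alpha_t).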


def predict_cascade(data, cascade):
    # Evaluate the cascaded classifier: take the sign of the alpha-weighted
    # sum of all individual classification decisions
    values = np.zeros(data.shape[0])
    for classifier in cascade:
        values = values + classifier.alpha * classifier.predict(data)

    return 2 * (values >= 0) - 1
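

# In formulas, the combined classifier is the usual AdaBoost ensemble
#
#     H(x) = sign( sum over t of alpha_t * h_t(x) )
#
# where a weighted sum of exactly zero is resolved towards +1 by the >=
# comparison above.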


# Load the data
data, labels, weights = load_data()

# Show the first 64 samples as an 8x8 grid of images
fig = plt.figure(figsize=(6, 6))  # figure size in inches
fig.subplots_adjust(left=0, right=1, bottom=0, top=1, hspace=0.05, wspace=0.05)

for i in range(64):
    ax = fig.add_subplot(8, 8, i + 1, xticks=[], yticks=[])
    ax.imshow(data[i, :].reshape(8, 8), cmap=plt.cm.binary, interpolation="nearest")

plt.show()
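
# Because the data was concatenated class by class and never shuffled, the 64
# images shown above all come from the positive class (the digit 2).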

# Start with an empty cascade
cascade = []

# Add 50 weak classifiers
for i in range(50):
    # Generate a new set of candidate weak classifiers
    classifiers = generate_weak_classifiers()

    # Pick one and re-evaluate the weight of each sample
    weights, cascade = build_one_stage(data, labels, weights, classifiers, cascade)

    # Calculate predictions for the whole cascade
    predictions = predict_cascade(data, cascade)

    # Count wrongly classified samples
    wrong = predictions != labels
    total_wrong = wrong.sum()

    # Also calculate an exponential error value from the thresholded predictions
    E = np.sum(np.exp(-predictions * labels))

    # Output
    print(f"Stage {i}, E={E:.5f}, total wrong = {total_wrong}")