Automated Self-training for semi-supervised tasks.
Source code in `semisupervised/autost.py`
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
class AutoST:
    """
    Automated self-training for semi-supervised learning.

    Repeatedly fits an AutoML model on the labeled data, pseudo-labels the
    most confident unlabeled samples using entropy-corrected prediction
    probabilities, and folds those samples back into the labeled set.  The
    three selection hyper-parameters (uncertainty penalty, probability
    threshold, pseudo-label sampling ratio) are annealed linearly from their
    *initial* to their *final* values over ``num_iterations`` rounds.
    """

    def __init__(self,
                 num_iterations,
                 entropy_filter_percentile,
                 lambda_uncertainty_final,
                 prob_threshold_final,
                 pseudo_label_ratio_final,
                 lambda_uncertainty_initial,
                 prob_threshold_initial,
                 pseudo_label_ratio_initial,
                 **kwargs):
        """
        Parameters
        ----------
        num_iterations : int
            Number of self-training rounds.
        entropy_filter_percentile : float
            Percentage of the highest-entropy samples to discard each round.
        lambda_uncertainty_final, prob_threshold_final, pseudo_label_ratio_final : float
            Values the annealing schedule ends at.
        lambda_uncertainty_initial, prob_threshold_initial, pseudo_label_ratio_initial : float
            Values the annealing schedule starts from.
        **kwargs
            Forwarded to ``AutoML``; ``results_path`` (if given) is also used
            as the root output folder for per-iteration results.
        """
        self.num_iterations = num_iterations
        self.entropy_filter_percentile = entropy_filter_percentile
        # "min" attributes hold the *final* schedule values ...
        self.lambda_uncertainty_min = lambda_uncertainty_final
        self.prob_threshold_min = prob_threshold_final
        self.pseudo_label_ratio_min = pseudo_label_ratio_final
        # ... and "max" attributes hold the *initial* ones (attribute names
        # kept for backward compatibility with external readers).
        self.lambda_uncertainty_max = lambda_uncertainty_initial
        self.prob_threshold_max = prob_threshold_initial
        self.pseudo_label_ratio_max = pseudo_label_ratio_initial
        self.kwargs = kwargs
        self.results_path = kwargs.get('results_path', '')

    def fit(self, X_labeled, y_labeled, X_unlabeled):
        """
        Run the self-training loop, growing ``(X_labeled, y_labeled)`` with
        pseudo-labeled samples drawn from ``X_unlabeled`` each round.
        """
        # Create the results folder if it does not exist.  Guard against the
        # default empty path: os.makedirs('') raises FileNotFoundError.
        if self.results_path and not os.path.exists(self.results_path):
            os.makedirs(self.results_path)
            print(f"Results folder created: {self.results_path}")
        # Per-round decrement for each linearly annealed parameter.
        lambda_uncertainty_step = (self.lambda_uncertainty_max - self.lambda_uncertainty_min) / self.num_iterations
        prob_threshold_step = (self.prob_threshold_max - self.prob_threshold_min) / self.num_iterations
        pseudo_label_ratio_step = (self.pseudo_label_ratio_max - self.pseudo_label_ratio_min) / self.num_iterations
        for i in range(self.num_iterations):
            print(f'****Self_training epoch {i} start****')
            # BUGFIX: anneal from the *initial* ("max") value down toward the
            # *final* ("min") one.  The previous schedule computed
            # ``min + i * step``, which applied the final values on the first
            # round and the initial values on the last — the reverse of what
            # the constructor's parameter names promise.
            lambda_uncertainty = self.lambda_uncertainty_max - i * lambda_uncertainty_step
            prob_threshold = self.prob_threshold_max - i * prob_threshold_step
            pseudo_label_ratio = self.pseudo_label_ratio_max - i * pseudo_label_ratio_step
            # Each round writes into its own sub-folder.
            self.kwargs['results_path'] = f"{self.results_path}/iteration_{i+1}"
            # Fresh AutoML system trained on the (growing) labeled set.
            automl = AutoML(**self.kwargs)
            automl.fit(X_labeled, y_labeled)
            pred_prob = automl.predict_proba(X_unlabeled)
            # Shannon entropy per sample; epsilon avoids log(0).
            entropy_uncertainty = -np.sum(pred_prob * np.log(pred_prob + 1e-16), axis=1)
            # Penalize every class probability by the sample's uncertainty.
            pred_prob_corrected = pred_prob - lambda_uncertainty * entropy_uncertainty[:, np.newaxis]
            pseudo_labels = np.argmax(pred_prob_corrected, axis=1)
            # Entropy cutoff: keep the (100 - entropy_filter_percentile)%
            # least-uncertain samples (comment previously hard-coded "97th").
            entropy_uncertainty_percentile = np.percentile(
                entropy_uncertainty, 100 - self.entropy_filter_percentile
            )
            entropy_filter = entropy_uncertainty < entropy_uncertainty_percentile
            # Confident = corrected probability above threshold AND low entropy.
            selected_samples = (np.max(pred_prob_corrected, axis=1) > prob_threshold) & entropy_filter
            # Pack the confident unlabeled samples with their pseudo-labels.
            data_unlabeled = pd.DataFrame(X_unlabeled[selected_samples])
            data_unlabeled['pseudo_label'] = pseudo_labels[selected_samples]
            # Keep only a fraction of the confident pool; fixed seed keeps the
            # draw reproducible across runs.
            data_unlabeled_downsampled = data_unlabeled.sample(frac=pseudo_label_ratio, random_state=42)
            X_unlabeled_selected = data_unlabeled_downsampled.drop(columns='pseudo_label')
            pseudo_labels_selected = data_unlabeled_downsampled['pseudo_label']
            # Grow the labeled set with the new pseudo-labeled samples.
            X_labeled = pd.concat([X_labeled, X_unlabeled_selected])
            y_labeled = pd.concat([y_labeled, pd.Series(pseudo_labels_selected)])
            print(f'****Self_training epoch {i} finish****')