Automated Self-training for semi-supervised tasks.
Source code in `semisupervised/autost.py`
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
class AutoST:
    """
    Automated self-training for semi-supervised learning.

    Repeatedly fits an AutoML model on the labeled data, pseudo-labels the
    most confident unlabeled samples using entropy-corrected prediction
    probabilities, and folds those samples back into the labeled set.  The
    three selection hyper-parameters (uncertainty penalty, probability
    threshold, pseudo-label sampling ratio) are annealed linearly from their
    *initial* to their *final* values over ``num_iterations`` rounds.
    """

    def __init__(self,
                 num_iterations,
                 entropy_filter_percentile,
                 lambda_uncertainty_final,
                 prob_threshold_final,
                 pseudo_label_ratio_final,
                 lambda_uncertainty_initial,
                 prob_threshold_initial,
                 pseudo_label_ratio_initial,
                 **kwargs):
        """
        Parameters
        ----------
        num_iterations : int
            Number of self-training rounds.
        entropy_filter_percentile : float
            Percentage of the highest-entropy samples to discard each round.
        lambda_uncertainty_final, prob_threshold_final, pseudo_label_ratio_final : float
            Values the annealing schedule ends at.
        lambda_uncertainty_initial, prob_threshold_initial, pseudo_label_ratio_initial : float
            Values the annealing schedule starts from.
        **kwargs
            Forwarded to ``AutoML``; ``results_path`` (if given) is also used
            as the root output folder for per-iteration results.
        """
        self.num_iterations = num_iterations
        self.entropy_filter_percentile = entropy_filter_percentile
        # "min" attributes hold the *final* schedule values ...
        self.lambda_uncertainty_min = lambda_uncertainty_final
        self.prob_threshold_min = prob_threshold_final
        self.pseudo_label_ratio_min = pseudo_label_ratio_final
        # ... and "max" attributes hold the *initial* ones (attribute names
        # kept for backward compatibility with external readers).
        self.lambda_uncertainty_max = lambda_uncertainty_initial
        self.prob_threshold_max = prob_threshold_initial
        self.pseudo_label_ratio_max = pseudo_label_ratio_initial
        self.kwargs = kwargs
        self.results_path = kwargs.get('results_path', '')

    def fit(self, X_labeled, y_labeled, X_unlabeled):
        """
        Run the self-training loop, growing ``(X_labeled, y_labeled)`` with
        pseudo-labeled samples drawn from ``X_unlabeled`` each round.
        """
        # Create the results folder if it does not exist.  Guard against the
        # default empty path: os.makedirs('') raises FileNotFoundError.
        if self.results_path and not os.path.exists(self.results_path):
            os.makedirs(self.results_path)
            print(f"Results folder created: {self.results_path}")
        # Per-round decrement for each linearly annealed parameter.
        lambda_uncertainty_step = (self.lambda_uncertainty_max - self.lambda_uncertainty_min) / self.num_iterations
        prob_threshold_step = (self.prob_threshold_max - self.prob_threshold_min) / self.num_iterations
        pseudo_label_ratio_step = (self.pseudo_label_ratio_max - self.pseudo_label_ratio_min) / self.num_iterations
        for i in range(self.num_iterations):
            print(f'****Self_training epoch {i} start****')
            # BUGFIX: anneal from the *initial* ("max") value down toward the
            # *final* ("min") one.  The previous schedule computed
            # ``min + i * step``, which applied the final values on the first
            # round and the initial values on the last — the reverse of what
            # the constructor's parameter names promise.
            lambda_uncertainty = self.lambda_uncertainty_max - i * lambda_uncertainty_step
            prob_threshold = self.prob_threshold_max - i * prob_threshold_step
            pseudo_label_ratio = self.pseudo_label_ratio_max - i * pseudo_label_ratio_step
            # Each round writes into its own sub-folder.
            self.kwargs['results_path'] = f"{self.results_path}/iteration_{i+1}"
            # Fresh AutoML system trained on the (growing) labeled set.
            automl = AutoML(**self.kwargs)
            automl.fit(X_labeled, y_labeled)
            pred_prob = automl.predict_proba(X_unlabeled)
            # Shannon entropy per sample; epsilon avoids log(0).
            entropy_uncertainty = -np.sum(pred_prob * np.log(pred_prob + 1e-16), axis=1)
            # Penalize every class probability by the sample's uncertainty.
            pred_prob_corrected = pred_prob - lambda_uncertainty * entropy_uncertainty[:, np.newaxis]
            pseudo_labels = np.argmax(pred_prob_corrected, axis=1)
            # Entropy cutoff: keep the (100 - entropy_filter_percentile)%
            # least-uncertain samples (comment previously hard-coded "97th").
            entropy_uncertainty_percentile = np.percentile(
                entropy_uncertainty, 100 - self.entropy_filter_percentile
            )
            entropy_filter = entropy_uncertainty < entropy_uncertainty_percentile
            # Confident = corrected probability above threshold AND low entropy.
            selected_samples = (np.max(pred_prob_corrected, axis=1) > prob_threshold) & entropy_filter
            # Pack the confident unlabeled samples with their pseudo-labels.
            data_unlabeled = pd.DataFrame(X_unlabeled[selected_samples])
            data_unlabeled['pseudo_label'] = pseudo_labels[selected_samples]
            # Keep only a fraction of the confident pool; fixed seed keeps the
            # draw reproducible across runs.
            data_unlabeled_downsampled = data_unlabeled.sample(frac=pseudo_label_ratio, random_state=42)
            X_unlabeled_selected = data_unlabeled_downsampled.drop(columns='pseudo_label')
            pseudo_labels_selected = data_unlabeled_downsampled['pseudo_label']
            # Grow the labeled set with the new pseudo-labeled samples.
            X_labeled = pd.concat([X_labeled, X_unlabeled_selected])
            y_labeled = pd.concat([y_labeled, pd.Series(pseudo_labels_selected)])
            print(f'****Self_training epoch {i} finish****')