#!/usr/bin/env python3
"""
This script demonstrates how to preprocess a new dataset for transfer learning.
It focuses on ensuring compatibility with a pre-trained model, including
handling image resizing, normalization, and label encoding.
"""

import os
import sys

from PIL import Image
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Lower-case file extensions that are treated as images.
_IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')


def load_and_preprocess_images(image_dir, target_size=(224, 224), grayscale=False):
    """
    Loads images from a directory, resizes them, and optionally converts them
    to grayscale. Every returned array has three channels.

    Files are visited in sorted filename order so that the result (and any
    seeded split performed on it later) is reproducible. The extension check
    is case-insensitive. Files that cannot be read are reported and skipped.

    Args:
        image_dir (str): Path to the directory containing the images.
        target_size (tuple): The desired size (width, height) of the images.
        grayscale (bool): Whether to convert images to grayscale. Grayscale
            images are still returned with three (identical) channels.

    Returns:
        tuple: A tuple containing a list of preprocessed image arrays and a
            list of corresponding filenames. Returns None, None if the
            directory itself cannot be listed.
    """
    try:
        # os.listdir() returns entries in arbitrary order; sort for determinism.
        entries = sorted(os.listdir(image_dir))
    except OSError as e:
        print(f"Error accessing image directory {image_dir}: {e}")
        return None, None

    images = []
    filenames = []
    for filename in entries:
        if not filename.lower().endswith(_IMAGE_EXTENSIONS):
            continue

        image_path = os.path.join(image_dir, filename)
        try:
            # The context manager closes the underlying file handle even when
            # decoding fails part-way through.
            with Image.open(image_path) as raw:
                # Converting to a fixed mode up front means palette, RGBA,
                # LA, etc. inputs all end up with a consistent channel count.
                img = raw.convert('L' if grayscale else 'RGB')
                img = img.resize(target_size)
                img_array = np.array(img)
        except OSError as e:
            print(f"Error processing image {filename}: {e}")
            continue

        # Single-channel (grayscale) results are replicated to three channels.
        if img_array.ndim == 2:
            img_array = np.stack([img_array] * 3, axis=-1)

        images.append(img_array)
        filenames.append(filename)

    return images, filenames


def normalize_images(images):
    """
    Normalizes pixel values of images by dividing them by 255.

    For 8-bit image data (values 0-255) the result lies in the range [0, 1].

    Args:
        images (list): A list of image arrays.

    Returns:
        list: A list of normalized (floating point) image arrays.
    """
    normalized_images = [img / 255.0 for img in images]
    return normalized_images


def encode_labels(labels):
    """
    Encodes categorical labels into numerical values using LabelEncoder.

    Args:
        labels (list): A list of categorical labels.

    Returns:
        numpy.ndarray: An array of encoded labels.
    """
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(labels)
    return encoded_labels


def create_dataframe(images, labels, filenames):
    """
    Creates a pandas DataFrame from the preprocessed images, labels, and
    filenames.

    Args:
        images (list): A list of preprocessed image arrays.
        labels (numpy.ndarray): An array of encoded labels.
        filenames (list): A list of filenames.

    Returns:
        pandas.DataFrame: A DataFrame with the columns 'image', 'label' and
            'filename'.
    """
    df = pd.DataFrame({'image': images, 'label': labels, 'filename': filenames})
    return df


def split_data(df, test_size=0.2, random_state=42):
    """
    Splits the data into training and testing sets.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        test_size (float): The proportion of the data to use for testing.
        random_state (int): The random state for reproducibility.

    Returns:
        tuple: A tuple containing the training and testing DataFrames.
    """
    train_df, test_df = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    return train_df, test_df


def main(image_dir):
    """
    Main function to demonstrate the data preprocessing steps.

    Loads the images, derives a label from each filename, encodes the labels,
    normalizes the pixel data, and prints a summary of the train/test split.

    Args:
        image_dir (str): Path to the directory containing the images.
    """
    images, filenames = load_and_preprocess_images(image_dir)

    if images is None or filenames is None:
        print("Error loading images. Exiting.")
        return

    # A train/test split needs at least one sample on each side; bail out
    # with a clear message instead of letting train_test_split raise.
    if len(images) < 2:
        print(
            f"Found {len(images)} usable image(s) in {image_dir}; "
            "at least 2 are required to create a train/test split. Exiting."
        )
        return

    # Example labels (replace with your actual labels)
    # Assuming filename format: label_image_id.jpg
    labels = [filename.split('_')[0] for filename in filenames]

    encoded_labels = encode_labels(labels)
    normalized_images = normalize_images(images)

    df = create_dataframe(normalized_images, encoded_labels, filenames)
    train_df, test_df = split_data(df)

    print("Training DataFrame shape:", train_df.shape)
    print("Testing DataFrame shape:", test_df.shape)
    print("First 5 rows of training DataFrame:")
    print(train_df.head())


if __name__ == "__main__":
    if len(sys.argv) > 1:
        image_directory = sys.argv[1]
        main(image_directory)
    else:
        print("Please provide the image directory as a command-line argument.")
        print("Example: python data_preprocessing_example.py path/to/images")