| | import os |
| | import random |
| | import cv2 |
| | from datetime import datetime |
| | import logging |
| |
|
| | |
# Send INFO-and-above records to a log file (path is relative to the current
# working directory), each prefixed with a timestamp and the level name.
log_file = "sample_images.log"
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    filename=log_file,
)
| |
|
# Lazily-created classifier shared by every detect_faces() call, so the
# cascade XML is parsed once instead of once per image.
_FACE_CASCADE = None


def _get_face_cascade():
    """Return the shared frontal-face Haar cascade, loading it on first use."""
    global _FACE_CASCADE
    if _FACE_CASCADE is None:
        _FACE_CASCADE = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
    return _FACE_CASCADE


def detect_faces(image_path):
    """Report whether the image at ``image_path`` contains a detectable face.

    The image is read as grayscale and scanned with OpenCV's bundled
    ``haarcascade_frontalface_default.xml`` cascade.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        True if the cascade reports at least one detection, False if it
        reports none or if the file could not be read as an image.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread signals an unreadable file by returning None rather
        # than raising, so treat that as "no face".
        return False

    faces = _get_face_cascade().detectMultiScale(
        image, scaleFactor=1.1, minNeighbors=5, minSize=(30, 30)
    )
    return len(faces) > 0
| |
|
# File-name suffixes (compared case-insensitively) that are treated as images.
_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')


def _link_or_copy(src, dst):
    """Place ``src`` at ``dst``; return False if ``dst`` already exists.

    A hard link is tried first; if the OS refuses it (for example because
    the two paths are on different filesystems) the file is copied instead.
    """
    import shutil  # local import: only needed for the copy fallback

    try:
        os.link(src, dst)
    except FileExistsError:
        # A previous run already produced this file; leave it untouched.
        return False
    except OSError:
        shutil.copy2(src, dst)
    return True


def sample_images(input_folder, output_folder, sample_rate=0.2, *, select=None):
    """Mirror the selected images of ``input_folder`` into ``output_folder``.

    The input tree is walked recursively. Every sub-directory is recreated
    under ``output_folder`` and each image file accepted by ``select`` is
    hard-linked (or copied, when linking is not possible) into the matching
    output sub-directory. Destinations that already exist are skipped, so
    the function can be re-run on the same folders. Progress is printed
    after each directory and a summary is written to the log.

    Args:
        input_folder: Root directory to scan.
        output_folder: Root directory receiving the mirrored tree; created
            if it does not exist.
        sample_rate: Currently unused; kept so existing callers keep working.
        select: Optional callable that takes an image path and returns a
            truthy value when the file should be kept. Defaults to
            ``detect_faces``.
    """
    if select is None:
        select = detect_faces

    os.makedirs(output_folder, exist_ok=True)
    output_root = os.path.abspath(output_folder)

    total_files = 0
    sampled_files = 0
    start_time = datetime.now()

    for root, dirs, files in os.walk(input_folder):
        # If the output folder lives inside the input tree, do not descend
        # into it; otherwise already-mirrored files would be re-processed.
        dirs[:] = [
            d for d in dirs
            if os.path.abspath(os.path.join(root, d)) != output_root
        ]

        relative_path = os.path.relpath(root, input_folder)
        output_subfolder = os.path.join(output_folder, relative_path)
        os.makedirs(output_subfolder, exist_ok=True)

        # Counts every file seen, not only images.
        total_files += len(files)

        for file in files:
            if not file.lower().endswith(_IMAGE_EXTENSIONS):
                continue
            input_file_path = os.path.join(root, file)
            if not select(input_file_path):
                continue

            sampled_files += 1
            output_file_path = os.path.join(output_subfolder, file)
            if _link_or_copy(input_file_path, output_file_path):
                logging.info(
                    "Sampled and copied %s to %s",
                    input_file_path, output_file_path,
                )
            else:
                logging.info(
                    "Skipped %s: %s already exists",
                    input_file_path, output_file_path,
                )

        elapsed_time = datetime.now() - start_time
        print(f"Processed {sampled_files}/{total_files} files in {elapsed_time}")

    end_time = datetime.now()
    total_time = end_time - start_time
    logging.info(f"Total time taken: {total_time}")
    logging.info(f"Sampled {sampled_files} out of {total_files} files.")
| |
|
if __name__ == "__main__":
    # Script entry point: mirror the selected images of EvalSet into
    # resampledEvalSet using the default settings.
    input_folder, output_folder = "EvalSet", "resampledEvalSet"
    sample_images(input_folder, output_folder)
| |
|