# CourseDesign/tests/test_data.py

import sys
import os
import pandas as pd
import numpy as np
import pytest

# Ensure src is in path
sys.path.append(os.getcwd())

from src.data import generate_data, preprocess_data

def test_generate_data_structure():
    """Check that generate_data yields a DataFrame of the requested length with all required columns."""
    sample_count = 50
    frame = generate_data(n_samples=sample_count)

    # Type and row count must match what was requested.
    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == sample_count

    required_columns = (
        "study_hours",
        "sleep_hours",
        "attendance_rate",
        "study_type",
        "stress_level",
        "is_pass",
    )
    # Collect every absent column so a failure reports all of them at once.
    absent = [name for name in required_columns if name not in frame.columns]
    assert not absent

def test_generate_data_content_range():
    """Verify that each generated column stays inside its permitted value range."""
    frame = generate_data(n_samples=50)

    # Per the original note, generation produces roughly 0-15 study hours;
    # 20 is kept as a deliberately loose upper bound.
    hours_studied = frame["study_hours"]
    assert hours_studied.min() >= 0
    assert hours_studied.max() <= 20

    hours_slept = frame["sleep_hours"]
    assert hours_slept.min() >= 0

    stress_in_range = frame["stress_level"].between(1, 5)
    assert stress_in_range.all()

    pass_is_binary = frame["is_pass"].isin([0, 1])
    assert pass_is_binary.all()

def test_generate_data_missing_values():
    """Test that generate_data actually injects missing values into attendance_rate.

    A fixed seed keeps the result reproducible, and 500 samples is large
    enough that at least one NaN is expected given the small per-row chance
    of a missing value noted for this column (about 5% according to the
    original comment here -- confirm against src.data if this test fails).
    """
    df = generate_data(n_samples=500, random_seed=42)

    missing_count = df["attendance_rate"].isnull().sum()

    # The earlier check (`>= 0`) could never fail because a count is never
    # negative; require at least one NaN so the test verifies the behavior.
    assert missing_count > 0, "expected at least one missing attendance_rate value"

    # Missing values should be sparse, not the whole column.
    assert missing_count < len(df), "attendance_rate is entirely missing"

def test_preprocess_data():
    """Check that preprocess_data collapses a duplicated row in a small frame."""
    # Four rows, two identical columns; the row (2, 2) appears twice.
    raw = pd.DataFrame({name: [1, 2, 2, 3] for name in ("a", "b")})

    deduplicated = preprocess_data(raw)

    # Only the three distinct rows should remain.
    assert len(deduplicated) == 3
feat: Add Streamlit application for student grade prediction and AI counseling. 2026-01-01 11:19:17 +08:00			`import sys`
			`import os`
			`import pandas as pd`
			`import numpy as np`
			`import pytest`

			`# Ensure src is in path`
			`sys.path.append(os.getcwd())`

			`from src.data import generate_data, preprocess_data`

			`def test_generate_data_structure():`
			`"""Test if generate_data returns a DataFrame with correct shape and columns."""`
			`df = generate_data(n_samples=50)`

			`assert isinstance(df, pd.DataFrame)`
			`assert len(df) == 50`

			`expected_cols = [`
			`"study_hours", "sleep_hours", "attendance_rate",`
			`"study_type", "stress_level", "is_pass"`
			`]`
			`for col in expected_cols:`
			`assert col in df.columns`

			`def test_generate_data_content_range():`
			`"""Test if generated data falls within expected value ranges."""`
			`df = generate_data(n_samples=50)`

			`assert df["study_hours"].min() >= 0`
			`assert df["study_hours"].max() <= 20 # Based on generation logic (0-15 actually, but safely below 20)`
			`assert df["sleep_hours"].min() >= 0`
			`assert df["stress_level"].between(1, 5).all()`
			`assert df["is_pass"].isin([0, 1]).all()`

			`def test_generate_data_missing_values():`
			`"""Test if generate_data creates missing values as expected (it has random logic)."""`
			`# Generate enough samples to likely get nans`
			`df = generate_data(n_samples=500, random_seed=42)`
			`# Check if we have nans in specific columns that are supposed to have them`
			`# In source: attendance_rate has 5% chance of nan`
			`assert df["attendance_rate"].isnull().sum() >= 0`

			`def test_preprocess_data():`
			`"""Test basic preprocessing (deduplication)."""`
			`df = pd.DataFrame({`
			`"a": [1, 2, 2, 3],`
			`"b": [1, 2, 2, 3]`
			`})`

			`clean_df = preprocess_data(df)`
			`assert len(clean_df) == 3`