harvey

Harvey Dataset Loader

Loads preprocessed Harvey nanobody polyreactivity dataset.

IMPORTANT: This module is for LOADING preprocessed data, not for running the preprocessing pipeline. The preprocessing script that CREATES the data is preprocessing/harvey/step2_extract_fragments.py.

Dataset characteristics:

- Nanobodies (VHH only, no light chain)
- High-throughput screen data (141,474 sequences; 141,021 ANARCI-validated fragment exports)
- Binary classification: high/low polyreactivity
- IMGT-numbered positions in raw data
- 6 fragment types (VHH-specific)

Source:

- data/test/harvey/raw/high_polyreactivity_high_throughput.csv
- data/test/harvey/raw/low_polyreactivity_high_throughput.csv

Reference:

- Harvey et al., "An in silico method to assess antibody fragment polyreactivity" (Nature Communications, 2022)
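
A minimal usage sketch, assuming the raw CSVs are already in data/test/harvey/raw/; the commented output notes are illustrative, and the columns are those documented for load_data below:

from antibody_training_esm.datasets.harvey import HarveyDataset

# Load the preprocessed high/low polyreactivity data using the default paths
dataset = HarveyDataset()
df = dataset.load_data()

print(len(df))                     # total sequences after empty-sequence filtering
print(df["label"].value_counts())  # 1 = high polyreactivity, 0 = low polyreactivity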

Classes

HarveyDataset

Bases: AntibodyDataset

Loader for Harvey nanobody dataset.

This class provides an interface to LOAD preprocessed Harvey dataset files. It does NOT run the preprocessing pipeline - use preprocessing/harvey/step2_extract_fragments.py for that.

The Harvey dataset contains VHH sequences (heavy chain only, no light chain) from a high-throughput polyreactivity screen. Sequences are provided with IMGT numbering and pre-extracted CDR regions.

Source code in src/antibody_training_esm/datasets/harvey.py
class HarveyDataset(AntibodyDataset):
    """
    Loader for Harvey nanobody dataset.

    This class provides an interface to LOAD preprocessed Harvey dataset files.
    It does NOT run the preprocessing pipeline - use preprocessing/harvey/step2_extract_fragments.py for that.

    The Harvey dataset contains VHH sequences (heavy chain only, no light chain) from
    a high-throughput polyreactivity screen. Sequences are provided with IMGT
    numbering and pre-extracted CDR regions.
    """

    def __init__(
        self, output_dir: Path | None = None, logger: logging.Logger | None = None
    ):
        """
        Initialize Harvey dataset loader.

        Args:
            output_dir: Directory containing preprocessed fragment files
            logger: Logger instance
        """
        super().__init__(
            dataset_name="harvey",
            output_dir=output_dir or HARVEY_OUTPUT_DIR,
            logger=logger,
        )

    @classmethod
    def get_schema(cls) -> pa.DataFrameSchema:
        return get_harvey_schema()

    def get_fragment_types(self) -> list[str]:
        """
        Return nanobody-specific fragment types.

        Harvey contains VHH sequences only (no light chain), so we generate
        6 fragment types instead of the full 16.

        Returns:
            List of 6 nanobody fragment types
        """
        return self.NANOBODY_FRAGMENTS

    def extract_sequence_from_imgt(self, row: pd.Series, imgt_cols: list[str]) -> str:
        """
        Extract full sequence from IMGT-numbered position columns.

        The Harvey raw data contains columns "1" through "128" representing
        IMGT numbering positions. This method concatenates non-gap positions
        to reconstruct the full sequence.

        Args:
            row: DataFrame row with IMGT position columns
            imgt_cols: List of column names ['1', '2', ..., '128']

        Returns:
            Full sequence string with gaps removed
        """
        positions = []
        for col in imgt_cols:
            if col in row and pd.notna(row[col]) and row[col] != "-":
                positions.append(row[col])
        return "".join(positions)

    def load_data(
        self,
        high_csv_path: str | Path | None = None,
        low_csv_path: str | Path | None = None,
        **_: Any,
    ) -> pd.DataFrame:
        """
        Load Harvey dataset from high/low polyreactivity CSV files.

        Args:
            high_csv_path: Path to high_polyreactivity_high_throughput.csv
            low_csv_path: Path to low_polyreactivity_high_throughput.csv

        Returns:
            DataFrame with columns: id, VH_sequence, label

        Raises:
            FileNotFoundError: If input CSV files not found
        """
        # Default paths
        if high_csv_path is None:
            high_csv_path = HARVEY_HIGH_POLY_CSV
        if low_csv_path is None:
            low_csv_path = HARVEY_LOW_POLY_CSV

        # Validate paths
        high_csv = Path(high_csv_path)
        low_csv = Path(low_csv_path)

        if not high_csv.exists():
            raise FileNotFoundError(
                f"High polyreactivity CSV not found: {high_csv}\n"
                f"Please ensure raw files are in data/test/harvey/raw/"
            )

        if not low_csv.exists():
            raise FileNotFoundError(
                f"Low polyreactivity CSV not found: {low_csv}\n"
                f"Please ensure raw files are in data/test/harvey/raw/"
            )

        # Load datasets
        self.logger.info(f"Reading high polyreactivity data from {high_csv}...")
        df_high = pd.read_csv(high_csv)
        if len(df_high) == 0:
            raise ValueError(
                f"Loaded dataset is empty: {high_csv}\n"
                "The CSV file may be corrupted or truncated. "
                "Please check the file or re-run preprocessing."
            )
        self.logger.info(f"  Loaded {len(df_high)} high polyreactivity sequences")

        self.logger.info(f"Reading low polyreactivity data from {low_csv}...")
        df_low = pd.read_csv(low_csv)
        if len(df_low) == 0:
            raise ValueError(
                f"Loaded dataset is empty: {low_csv}\n"
                "The CSV file may be corrupted or truncated. "
                "Please check the file or re-run preprocessing."
            )
        self.logger.info(f"  Loaded {len(df_low)} low polyreactivity sequences")

        # IMGT position columns (1-128)
        imgt_cols = [str(i) for i in range(1, 129)]

        # Extract full sequences from IMGT positions
        self.logger.info("Extracting sequences from IMGT positions...")
        df_high["VH_sequence"] = df_high.apply(
            lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
        )
        df_low["VH_sequence"] = df_low.apply(
            lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
        )

        # Add binary labels
        df_high["label"] = 1  # high polyreactivity = non-specific
        df_low["label"] = 0  # low polyreactivity = specific

        # Combine datasets
        self.logger.info("Combining high and low polyreactivity datasets...")
        df_combined = pd.concat([df_high, df_low], ignore_index=True)

        # Create sequence IDs
        df_combined["id"] = [f"harvey_{i:06d}" for i in range(len(df_combined))]

        # Select standardized columns
        df_output = df_combined[["id", "VH_sequence", "label"]].copy()

        # Filter out empty sequences
        empty_mask = df_output["VH_sequence"].str.len() == 0
        if empty_mask.any():
            n_empty = empty_mask.sum()
            self.logger.warning(f"Removing {n_empty} sequences with zero length")
            df_output = df_output[~empty_mask].reset_index(drop=True)

        # Create 'sequence' column for schema validation (use VH)
        if "sequence" not in df_output.columns and "VH_sequence" in df_output.columns:
            df_output["sequence"] = df_output["VH_sequence"]

        # Validate with Pandera
        df_output = self.validate_dataframe(df_output)

        self.logger.info(f"Combined dataset: {len(df_output)} sequences")
        self.logger.info(
            f"  High polyreactivity (label=1): {(df_output['label'] == 1).sum()}"
        )
        self.logger.info(
            f"  Low polyreactivity (label=0): {(df_output['label'] == 0).sum()}"
        )

        # Sequence length stats
        seq_lengths = df_output["VH_sequence"].str.len()
        self.logger.info(
            f"Sequence length range: {seq_lengths.min()}-{seq_lengths.max()} aa "
            f"(mean: {seq_lengths.mean():.1f})"
        )

        return df_output
Functions
get_fragment_types()

Return nanobody-specific fragment types.

Harvey contains VHH sequences only (no light chain), so we generate 6 fragment types instead of the full 16.

Returns:

Type        Description
list[str]   List of 6 nanobody fragment types

Source code in src/antibody_training_esm/datasets/harvey.py
def get_fragment_types(self) -> list[str]:
    """
    Return nanobody-specific fragment types.

    Harvey contains VHH sequences only (no light chain), so we generate
    6 fragment types instead of the full 16.

    Returns:
        List of 6 nanobody fragment types
    """
    return self.NANOBODY_FRAGMENTS
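
As a quick sketch, the fragment list can be inspected directly; the exact fragment names live in AntibodyDataset.NANOBODY_FRAGMENTS and are not reproduced here:

from antibody_training_esm.datasets.harvey import HarveyDataset

fragments = HarveyDataset().get_fragment_types()
assert len(fragments) == 6  # VHH-specific subset, not the full 16 fragment types
print(fragments)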
extract_sequence_from_imgt(row, imgt_cols)

Extract full sequence from IMGT-numbered position columns.

The Harvey raw data contains columns "1" through "128" representing IMGT numbering positions. This method concatenates non-gap positions to reconstruct the full sequence.

Parameters:

Name        Type        Description                                    Default
row         Series      DataFrame row with IMGT position columns       required
imgt_cols   list[str]   List of column names ['1', '2', ..., '128']   required

Returns:

Type   Description
str    Full sequence string with gaps removed

Source code in src/antibody_training_esm/datasets/harvey.py
def extract_sequence_from_imgt(self, row: pd.Series, imgt_cols: list[str]) -> str:
    """
    Extract full sequence from IMGT-numbered position columns.

    The Harvey raw data contains columns "1" through "128" representing
    IMGT numbering positions. This method concatenates non-gap positions
    to reconstruct the full sequence.

    Args:
        row: DataFrame row with IMGT position columns
        imgt_cols: List of column names ['1', '2', ..., '128']

    Returns:
        Full sequence string with gaps removed
    """
    positions = []
    for col in imgt_cols:
        if col in row and pd.notna(row[col]) and row[col] != "-":
            positions.append(row[col])
    return "".join(positions)
load_data(high_csv_path=None, low_csv_path=None, **_)

Load Harvey dataset from high/low polyreactivity CSV files.

Parameters:

Name            Type                Description                                        Default
high_csv_path   str | Path | None   Path to high_polyreactivity_high_throughput.csv   None
low_csv_path    str | Path | None   Path to low_polyreactivity_high_throughput.csv    None

Returns:

Type        Description
DataFrame   DataFrame with columns: id, VH_sequence, label

Raises:

Type                Description
FileNotFoundError   If input CSV files not found

Source code in src/antibody_training_esm/datasets/harvey.py
def load_data(
    self,
    high_csv_path: str | Path | None = None,
    low_csv_path: str | Path | None = None,
    **_: Any,
) -> pd.DataFrame:
    """
    Load Harvey dataset from high/low polyreactivity CSV files.

    Args:
        high_csv_path: Path to high_polyreactivity_high_throughput.csv
        low_csv_path: Path to low_polyreactivity_high_throughput.csv

    Returns:
        DataFrame with columns: id, VH_sequence, label

    Raises:
        FileNotFoundError: If input CSV files not found
    """
    # Default paths
    if high_csv_path is None:
        high_csv_path = HARVEY_HIGH_POLY_CSV
    if low_csv_path is None:
        low_csv_path = HARVEY_LOW_POLY_CSV

    # Validate paths
    high_csv = Path(high_csv_path)
    low_csv = Path(low_csv_path)

    if not high_csv.exists():
        raise FileNotFoundError(
            f"High polyreactivity CSV not found: {high_csv}\n"
            f"Please ensure raw files are in data/test/harvey/raw/"
        )

    if not low_csv.exists():
        raise FileNotFoundError(
            f"Low polyreactivity CSV not found: {low_csv}\n"
            f"Please ensure raw files are in data/test/harvey/raw/"
        )

    # Load datasets
    self.logger.info(f"Reading high polyreactivity data from {high_csv}...")
    df_high = pd.read_csv(high_csv)
    if len(df_high) == 0:
        raise ValueError(
            f"Loaded dataset is empty: {high_csv}\n"
            "The CSV file may be corrupted or truncated. "
            "Please check the file or re-run preprocessing."
        )
    self.logger.info(f"  Loaded {len(df_high)} high polyreactivity sequences")

    self.logger.info(f"Reading low polyreactivity data from {low_csv}...")
    df_low = pd.read_csv(low_csv)
    if len(df_low) == 0:
        raise ValueError(
            f"Loaded dataset is empty: {low_csv}\n"
            "The CSV file may be corrupted or truncated. "
            "Please check the file or re-run preprocessing."
        )
    self.logger.info(f"  Loaded {len(df_low)} low polyreactivity sequences")

    # IMGT position columns (1-128)
    imgt_cols = [str(i) for i in range(1, 129)]

    # Extract full sequences from IMGT positions
    self.logger.info("Extracting sequences from IMGT positions...")
    df_high["VH_sequence"] = df_high.apply(
        lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
    )
    df_low["VH_sequence"] = df_low.apply(
        lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
    )

    # Add binary labels
    df_high["label"] = 1  # high polyreactivity = non-specific
    df_low["label"] = 0  # low polyreactivity = specific

    # Combine datasets
    self.logger.info("Combining high and low polyreactivity datasets...")
    df_combined = pd.concat([df_high, df_low], ignore_index=True)

    # Create sequence IDs
    df_combined["id"] = [f"harvey_{i:06d}" for i in range(len(df_combined))]

    # Select standardized columns
    df_output = df_combined[["id", "VH_sequence", "label"]].copy()

    # Filter out empty sequences
    empty_mask = df_output["VH_sequence"].str.len() == 0
    if empty_mask.any():
        n_empty = empty_mask.sum()
        self.logger.warning(f"Removing {n_empty} sequences with zero length")
        df_output = df_output[~empty_mask].reset_index(drop=True)

    # Create 'sequence' column for schema validation (use VH)
    if "sequence" not in df_output.columns and "VH_sequence" in df_output.columns:
        df_output["sequence"] = df_output["VH_sequence"]

    # Validate with Pandera
    df_output = self.validate_dataframe(df_output)

    self.logger.info(f"Combined dataset: {len(df_output)} sequences")
    self.logger.info(
        f"  High polyreactivity (label=1): {(df_output['label'] == 1).sum()}"
    )
    self.logger.info(
        f"  Low polyreactivity (label=0): {(df_output['label'] == 0).sum()}"
    )

    # Sequence length stats
    seq_lengths = df_output["VH_sequence"].str.len()
    self.logger.info(
        f"Sequence length range: {seq_lengths.min()}-{seq_lengths.max()} aa "
        f"(mean: {seq_lengths.mean():.1f})"
    )

    return df_output
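
A sketch of overriding the default locations, using the raw-file paths named in the error messages above:

from antibody_training_esm.datasets.harvey import HarveyDataset

dataset = HarveyDataset()
df = dataset.load_data(
    high_csv_path="data/test/harvey/raw/high_polyreactivity_high_throughput.csv",
    low_csv_path="data/test/harvey/raw/low_polyreactivity_high_throughput.csv",
)
# label is 1 for high polyreactivity (non-specific), 0 for low (specific)
print(df[["id", "VH_sequence", "label"]].head())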

Functions

load_harvey_data(high_csv=None, low_csv=None)

Convenience function to load preprocessed Harvey dataset.

IMPORTANT: This loads PREPROCESSED data. To preprocess raw data, use: preprocessing/harvey/step2_extract_fragments.py

Parameters:

Name       Type         Description                       Default
high_csv   str | None   Path to high polyreactivity CSV   None
low_csv    str | None   Path to low polyreactivity CSV    None

Returns:

Type        Description
DataFrame   DataFrame with preprocessed data

Example

from antibody_training_esm.datasets.harvey import load_harvey_data

df = load_harvey_data()
print(f"Loaded {len(df)} sequences")

Source code in src/antibody_training_esm/datasets/harvey.py
def load_harvey_data(
    high_csv: str | None = None,
    low_csv: str | None = None,
) -> pd.DataFrame:
    """
    Convenience function to load preprocessed Harvey dataset.

    IMPORTANT: This loads PREPROCESSED data. To preprocess raw data, use:
    preprocessing/harvey/step2_extract_fragments.py

    Args:
        high_csv: Path to high polyreactivity CSV
        low_csv: Path to low polyreactivity CSV

    Returns:
        DataFrame with preprocessed data

    Example:
        >>> from antibody_training_esm.datasets.harvey import load_harvey_data
        >>> df = load_harvey_data()
        >>> print(f"Loaded {len(df)} sequences")
    """
    dataset = HarveyDataset()
    return dataset.load_data(high_csv_path=high_csv, low_csv_path=low_csv)