Harvey Dataset Loader
Loads preprocessed Harvey nanobody polyreactivity dataset.
IMPORTANT: This module is for LOADING preprocessed data, not for running
the preprocessing pipeline. The preprocessing scripts that CREATE the data
are in: preprocessing/harvey/step2_extract_fragments.py
Dataset characteristics:
- Nanobodies (VHH only, no light chain)
- High-throughput screen data (141,474 sequences; 141,021 ANARCI-validated fragment exports)
- Binary classification: high/low polyreactivity
- IMGT-numbered positions in raw data (see the layout sketch below)
- 6 fragment types (VHH-specific)
Source:
- data/test/harvey/raw/high_polyreactivity_high_throughput.csv
- data/test/harvey/raw/low_polyreactivity_high_throughput.csv
Reference:
- Harvey et al., "An in silico method to assess antibody fragment polyreactivity" (Nature Communications, 2022)
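
To make the raw layout concrete, here is a minimal inspection sketch. It assumes only what is documented above: the raw CSV path and the IMGT position columns "1" through "128"; any other columns in the file are left unspecified.

```python
# Sketch: peek at the IMGT-numbered layout of the raw Harvey data.
import pandas as pd

df = pd.read_csv("data/test/harvey/raw/high_polyreactivity_high_throughput.csv")
imgt_cols = [str(i) for i in range(1, 129)]  # IMGT positions 1-128
print(df[imgt_cols].head())  # gap positions appear as "-"
```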
Classes
HarveyDataset
Bases: AntibodyDataset
Loader for Harvey nanobody dataset.
This class provides an interface to LOAD preprocessed Harvey dataset files.
It does NOT run the preprocessing pipeline - use preprocessing/harvey/step2_extract_fragments.py for that.
The Harvey dataset contains VHH sequences (heavy chain only, no light chain) from
a high-throughput polyreactivity screen. Sequences are provided with IMGT
numbering and pre-extracted CDR regions.
Source code in src/antibody_training_esm/datasets/harvey.py
```python
class HarveyDataset(AntibodyDataset):
    """
    Loader for Harvey nanobody dataset.

    This class provides an interface to LOAD preprocessed Harvey dataset files.
    It does NOT run the preprocessing pipeline - use
    preprocessing/harvey/step2_extract_fragments.py for that.

    The Harvey dataset contains VHH sequences (heavy chain only, no light chain)
    from a high-throughput polyreactivity screen. Sequences are provided with
    IMGT numbering and pre-extracted CDR regions.
    """

    def __init__(
        self, output_dir: Path | None = None, logger: logging.Logger | None = None
    ):
        """
        Initialize Harvey dataset loader.

        Args:
            output_dir: Directory containing preprocessed fragment files
            logger: Logger instance
        """
        super().__init__(
            dataset_name="harvey",
            output_dir=output_dir or HARVEY_OUTPUT_DIR,
            logger=logger,
        )

    @classmethod
    def get_schema(cls) -> pa.DataFrameSchema:
        return get_harvey_schema()

    def get_fragment_types(self) -> list[str]:
        """
        Return nanobody-specific fragment types.

        Harvey contains VHH sequences only (no light chain), so we generate
        6 fragment types instead of the full 16.

        Returns:
            List of 6 nanobody fragment types
        """
        return self.NANOBODY_FRAGMENTS

    def extract_sequence_from_imgt(self, row: pd.Series, imgt_cols: list[str]) -> str:
        """
        Extract full sequence from IMGT-numbered position columns.

        The Harvey raw data contains columns "1" through "128" representing
        IMGT numbering positions. This method concatenates non-gap positions
        to reconstruct the full sequence.

        Args:
            row: DataFrame row with IMGT position columns
            imgt_cols: List of column names ['1', '2', ..., '128']

        Returns:
            Full sequence string with gaps removed
        """
        positions = []
        for col in imgt_cols:
            if col in row and pd.notna(row[col]) and row[col] != "-":
                positions.append(row[col])
        return "".join(positions)

    def load_data(
        self,
        high_csv_path: str | Path | None = None,
        low_csv_path: str | Path | None = None,
        **_: Any,
    ) -> pd.DataFrame:
        """
        Load Harvey dataset from high/low polyreactivity CSV files.

        Args:
            high_csv_path: Path to high_polyreactivity_high_throughput.csv
            low_csv_path: Path to low_polyreactivity_high_throughput.csv

        Returns:
            DataFrame with columns: id, VH_sequence, label

        Raises:
            FileNotFoundError: If input CSV files not found
        """
        # Default paths
        if high_csv_path is None:
            high_csv_path = HARVEY_HIGH_POLY_CSV
        if low_csv_path is None:
            low_csv_path = HARVEY_LOW_POLY_CSV

        # Validate paths
        high_csv = Path(high_csv_path)
        low_csv = Path(low_csv_path)
        if not high_csv.exists():
            raise FileNotFoundError(
                f"High polyreactivity CSV not found: {high_csv}\n"
                f"Please ensure raw files are in data/test/harvey/raw/"
            )
        if not low_csv.exists():
            raise FileNotFoundError(
                f"Low polyreactivity CSV not found: {low_csv}\n"
                f"Please ensure raw files are in data/test/harvey/raw/"
            )

        # Load datasets
        self.logger.info(f"Reading high polyreactivity data from {high_csv}...")
        df_high = pd.read_csv(high_csv)
        if len(df_high) == 0:
            raise ValueError(
                f"Loaded dataset is empty: {high_csv}\n"
                "The CSV file may be corrupted or truncated. "
                "Please check the file or re-run preprocessing."
            )
        self.logger.info(f" Loaded {len(df_high)} high polyreactivity sequences")

        self.logger.info(f"Reading low polyreactivity data from {low_csv}...")
        df_low = pd.read_csv(low_csv)
        if len(df_low) == 0:
            raise ValueError(
                f"Loaded dataset is empty: {low_csv}\n"
                "The CSV file may be corrupted or truncated. "
                "Please check the file or re-run preprocessing."
            )
        self.logger.info(f" Loaded {len(df_low)} low polyreactivity sequences")

        # IMGT position columns (1-128)
        imgt_cols = [str(i) for i in range(1, 129)]

        # Extract full sequences from IMGT positions
        self.logger.info("Extracting sequences from IMGT positions...")
        df_high["VH_sequence"] = df_high.apply(
            lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
        )
        df_low["VH_sequence"] = df_low.apply(
            lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
        )

        # Add binary labels
        df_high["label"] = 1  # high polyreactivity = non-specific
        df_low["label"] = 0  # low polyreactivity = specific

        # Combine datasets
        self.logger.info("Combining high and low polyreactivity datasets...")
        df_combined = pd.concat([df_high, df_low], ignore_index=True)

        # Create sequence IDs
        df_combined["id"] = [f"harvey_{i:06d}" for i in range(len(df_combined))]

        # Select standardized columns
        df_output = df_combined[["id", "VH_sequence", "label"]].copy()

        # Filter out empty sequences
        empty_mask = df_output["VH_sequence"].str.len() == 0
        if empty_mask.any():
            n_empty = empty_mask.sum()
            self.logger.warning(f"Removing {n_empty} sequences with zero length")
            df_output = df_output[~empty_mask].reset_index(drop=True)

        # Create 'sequence' column for schema validation (use VH)
        if "sequence" not in df_output.columns and "VH_sequence" in df_output.columns:
            df_output["sequence"] = df_output["VH_sequence"]

        # Validate with Pandera
        df_output = self.validate_dataframe(df_output)

        self.logger.info(f"Combined dataset: {len(df_output)} sequences")
        self.logger.info(
            f" High polyreactivity (label=1): {(df_output['label'] == 1).sum()}"
        )
        self.logger.info(
            f" Low polyreactivity (label=0): {(df_output['label'] == 0).sum()}"
        )

        # Sequence length stats
        seq_lengths = df_output["VH_sequence"].str.len()
        self.logger.info(
            f"Sequence length range: {seq_lengths.min()}-{seq_lengths.max()} aa "
            f"(mean: {seq_lengths.mean():.1f})"
        )

        return df_output
```
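
A minimal end-to-end sketch, assuming the raw CSVs sit at their documented default locations (`HARVEY_OUTPUT_DIR` and the default CSV paths come from the module's constants):

```python
# Sketch: load the Harvey dataset with all defaults.
import logging

from antibody_training_esm.datasets.harvey import HarveyDataset

logging.basicConfig(level=logging.INFO)
dataset = HarveyDataset()          # output_dir defaults to HARVEY_OUTPUT_DIR
df = dataset.load_data()           # reads the default high/low raw CSVs
print(df["label"].value_counts())  # 1 = high polyreactivity, 0 = low
```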
Functions
get_fragment_types()
Return nanobody-specific fragment types.
Harvey contains VHH sequences only (no light chain), so we generate
6 fragment types instead of the full 16.
Returns:

| Type | Description |
| --- | --- |
| `list[str]` | List of 6 nanobody fragment types |
Source code in src/antibody_training_esm/datasets/harvey.py
```python
def get_fragment_types(self) -> list[str]:
    """
    Return nanobody-specific fragment types.

    Harvey contains VHH sequences only (no light chain), so we generate
    6 fragment types instead of the full 16.

    Returns:
        List of 6 nanobody fragment types
    """
    return self.NANOBODY_FRAGMENTS
```
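
Call sketch (the six concrete fragment names are defined by `NANOBODY_FRAGMENTS` on the base class and are not reproduced on this page):

```python
from antibody_training_esm.datasets.harvey import HarveyDataset

fragments = HarveyDataset().get_fragment_types()
print(len(fragments))  # 6 - the VHH-specific subset of the full 16 fragment types
```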
extract_sequence_from_imgt(row, imgt_cols)
Extract full sequence from IMGT-numbered position columns.
The Harvey raw data contains columns "1" through "128" representing
IMGT numbering positions. This method concatenates non-gap positions
to reconstruct the full sequence.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `row` | `Series` | DataFrame row with IMGT position columns | *required* |
| `imgt_cols` | `list[str]` | List of column names ['1', '2', ..., '128'] | *required* |

Returns:

| Type | Description |
| --- | --- |
| `str` | Full sequence string with gaps removed |
Source code in src/antibody_training_esm/datasets/harvey.py
```python
def extract_sequence_from_imgt(self, row: pd.Series, imgt_cols: list[str]) -> str:
    """
    Extract full sequence from IMGT-numbered position columns.

    The Harvey raw data contains columns "1" through "128" representing
    IMGT numbering positions. This method concatenates non-gap positions
    to reconstruct the full sequence.

    Args:
        row: DataFrame row with IMGT position columns
        imgt_cols: List of column names ['1', '2', ..., '128']

    Returns:
        Full sequence string with gaps removed
    """
    positions = []
    for col in imgt_cols:
        if col in row and pd.notna(row[col]) and row[col] != "-":
            positions.append(row[col])
    return "".join(positions)
```
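
An illustrative sketch with a synthetic row of toy residues (not real Harvey data); NaN and gap ("-") positions are skipped. It assumes `HarveyDataset()` can be constructed without the preprocessed files present:

```python
import pandas as pd

from antibody_training_esm.datasets.harvey import HarveyDataset

imgt_cols = [str(i) for i in range(1, 129)]
row = pd.Series({col: "-" for col in imgt_cols})  # start with all gaps
row["1"], row["2"], row["3"] = "Q", "V", "Q"      # toy residues at positions 1-3

print(HarveyDataset().extract_sequence_from_imgt(row, imgt_cols))  # -> "QVQ"
```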
load_data(high_csv_path=None, low_csv_path=None, **_)
Load Harvey dataset from high/low polyreactivity CSV files.
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `high_csv_path` | `str \| Path \| None` | Path to high_polyreactivity_high_throughput.csv | `None` |
| `low_csv_path` | `str \| Path \| None` | Path to low_polyreactivity_high_throughput.csv | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame with columns: id, VH_sequence, label |

Raises:

| Type | Description |
| --- | --- |
| `FileNotFoundError` | If input CSV files not found |
Source code in src/antibody_training_esm/datasets/harvey.py
```python
def load_data(
    self,
    high_csv_path: str | Path | None = None,
    low_csv_path: str | Path | None = None,
    **_: Any,
) -> pd.DataFrame:
    """
    Load Harvey dataset from high/low polyreactivity CSV files.

    Args:
        high_csv_path: Path to high_polyreactivity_high_throughput.csv
        low_csv_path: Path to low_polyreactivity_high_throughput.csv

    Returns:
        DataFrame with columns: id, VH_sequence, label

    Raises:
        FileNotFoundError: If input CSV files not found
    """
    # Default paths
    if high_csv_path is None:
        high_csv_path = HARVEY_HIGH_POLY_CSV
    if low_csv_path is None:
        low_csv_path = HARVEY_LOW_POLY_CSV

    # Validate paths
    high_csv = Path(high_csv_path)
    low_csv = Path(low_csv_path)
    if not high_csv.exists():
        raise FileNotFoundError(
            f"High polyreactivity CSV not found: {high_csv}\n"
            f"Please ensure raw files are in data/test/harvey/raw/"
        )
    if not low_csv.exists():
        raise FileNotFoundError(
            f"Low polyreactivity CSV not found: {low_csv}\n"
            f"Please ensure raw files are in data/test/harvey/raw/"
        )

    # Load datasets
    self.logger.info(f"Reading high polyreactivity data from {high_csv}...")
    df_high = pd.read_csv(high_csv)
    if len(df_high) == 0:
        raise ValueError(
            f"Loaded dataset is empty: {high_csv}\n"
            "The CSV file may be corrupted or truncated. "
            "Please check the file or re-run preprocessing."
        )
    self.logger.info(f" Loaded {len(df_high)} high polyreactivity sequences")

    self.logger.info(f"Reading low polyreactivity data from {low_csv}...")
    df_low = pd.read_csv(low_csv)
    if len(df_low) == 0:
        raise ValueError(
            f"Loaded dataset is empty: {low_csv}\n"
            "The CSV file may be corrupted or truncated. "
            "Please check the file or re-run preprocessing."
        )
    self.logger.info(f" Loaded {len(df_low)} low polyreactivity sequences")

    # IMGT position columns (1-128)
    imgt_cols = [str(i) for i in range(1, 129)]

    # Extract full sequences from IMGT positions
    self.logger.info("Extracting sequences from IMGT positions...")
    df_high["VH_sequence"] = df_high.apply(
        lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
    )
    df_low["VH_sequence"] = df_low.apply(
        lambda row: self.extract_sequence_from_imgt(row, imgt_cols), axis=1
    )

    # Add binary labels
    df_high["label"] = 1  # high polyreactivity = non-specific
    df_low["label"] = 0  # low polyreactivity = specific

    # Combine datasets
    self.logger.info("Combining high and low polyreactivity datasets...")
    df_combined = pd.concat([df_high, df_low], ignore_index=True)

    # Create sequence IDs
    df_combined["id"] = [f"harvey_{i:06d}" for i in range(len(df_combined))]

    # Select standardized columns
    df_output = df_combined[["id", "VH_sequence", "label"]].copy()

    # Filter out empty sequences
    empty_mask = df_output["VH_sequence"].str.len() == 0
    if empty_mask.any():
        n_empty = empty_mask.sum()
        self.logger.warning(f"Removing {n_empty} sequences with zero length")
        df_output = df_output[~empty_mask].reset_index(drop=True)

    # Create 'sequence' column for schema validation (use VH)
    if "sequence" not in df_output.columns and "VH_sequence" in df_output.columns:
        df_output["sequence"] = df_output["VH_sequence"]

    # Validate with Pandera
    df_output = self.validate_dataframe(df_output)

    self.logger.info(f"Combined dataset: {len(df_output)} sequences")
    self.logger.info(
        f" High polyreactivity (label=1): {(df_output['label'] == 1).sum()}"
    )
    self.logger.info(
        f" Low polyreactivity (label=0): {(df_output['label'] == 0).sum()}"
    )

    # Sequence length stats
    seq_lengths = df_output["VH_sequence"].str.len()
    self.logger.info(
        f"Sequence length range: {seq_lengths.min()}-{seq_lengths.max()} aa "
        f"(mean: {seq_lengths.mean():.1f})"
    )

    return df_output
```
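
Usage sketch with explicit paths (these are the documented defaults; substitute your own copies if the files live elsewhere):

```python
from antibody_training_esm.datasets.harvey import HarveyDataset

df = HarveyDataset().load_data(
    high_csv_path="data/test/harvey/raw/high_polyreactivity_high_throughput.csv",
    low_csv_path="data/test/harvey/raw/low_polyreactivity_high_throughput.csv",
)
# Output columns: id, VH_sequence, label, plus the 'sequence' alias added
# for schema validation.
print(df.head())
```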
Functions
load_harvey_data(high_csv=None, low_csv=None)
Convenience function to load preprocessed Harvey dataset.
IMPORTANT: This loads PREPROCESSED data. To preprocess raw data, use:
preprocessing/harvey/step2_extract_fragments.py
Parameters:

| Name | Type | Description | Default |
| --- | --- | --- | --- |
| `high_csv` | `str \| None` | Path to high polyreactivity CSV | `None` |
| `low_csv` | `str \| None` | Path to low polyreactivity CSV | `None` |

Returns:

| Type | Description |
| --- | --- |
| `DataFrame` | DataFrame with preprocessed data |
Example:

```python
from antibody_training_esm.datasets.harvey import load_harvey_data

df = load_harvey_data()
print(f"Loaded {len(df)} sequences")
```
Source code in src/antibody_training_esm/datasets/harvey.py
```python
def load_harvey_data(
    high_csv: str | None = None,
    low_csv: str | None = None,
) -> pd.DataFrame:
    """
    Convenience function to load preprocessed Harvey dataset.

    IMPORTANT: This loads PREPROCESSED data. To preprocess raw data, use:
        preprocessing/harvey/step2_extract_fragments.py

    Args:
        high_csv: Path to high polyreactivity CSV
        low_csv: Path to low polyreactivity CSV

    Returns:
        DataFrame with preprocessed data

    Example:
        >>> from antibody_training_esm.datasets.harvey import load_harvey_data
        >>> df = load_harvey_data()
        >>> print(f"Loaded {len(df)} sequences")
    """
    dataset = HarveyDataset()
    return dataset.load_data(high_csv_path=high_csv, low_csv_path=low_csv)
```