---
title: "Importing legacy PhIP-Seq data (convert_legacy)"
output: rmarkdown::html_vignette
vignette: >
  %\VignetteIndexEntry{Importing legacy PhIP-Seq data (convert_legacy)}
  %\VignetteEngine{knitr::rmarkdown}
  %\VignetteEncoding{UTF-8}
---

```{r, include = FALSE}
# Merge each chunk's source and output into one block; prefix output with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")
```

```{r setup}
library(phiperio)
library(dplyr)
```

# What this covers

`convert_legacy()` ingests the classic three-file PhIP-Seq input
(exist/fold_change/raw counts) plus sample metadata (and optional timepoints).
This vignette shows compact cross-sectional and longitudinal examples.

# Cross-sectional: one sample per subject

We create minimal CSVs in a temporary directory: the `exist` matrix, the raw input and hit counts, and the sample metadata.

```{r}
# Scratch directory for the example input files.
tmp <- withr::local_tempdir()

# Destination of each legacy input file.
exist_path   <- file.path(tmp, "exist.csv")
input_path   <- file.path(tmp, "counts_input.csv")
hit_path     <- file.path(tmp, "counts_hit.csv")
samples_path <- file.path(tmp, "samples.csv")

# exist matrix: one row per peptide, one column per sample
exist_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1 = c(1, 0),
  s2 = c(0, 1)
)
write.csv(exist_df, file = exist_path, row.names = FALSE)

# raw counts: input
input_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1 = c(100, 80),
  s2 = c(90, 120)
)
write.csv(input_df, file = input_path, row.names = FALSE)

# raw counts: hit
hit_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1 = c(5, 0),
  s2 = c(0, 7)
)
write.csv(hit_df, file = hit_path, row.names = FALSE)

# sample metadata (cross-sectional: sample_id == subject_id)
samples_df <- data.frame(
  sample_id = c("s1", "s2"),
  age = c(34, 58),
  sex = c("F", "M")
)
write.csv(samples_df, file = samples_path, row.names = FALSE)

# Import the four files; age and sex are passed through as extra columns.
pd_xc <- convert_legacy(
  exist_file = exist_path,
  input_file = input_path,
  hit_file = hit_path,
  samples_file = samples_path,
  extra_cols = c("age", "sex"),
  peptide_library = FALSE,
  materialise_table = FALSE
)

# Show the imported counts in a stable order.
get_counts(pd_xc) |>
  arrange(sample_id, peptide_id) |>
  collect()
```

# Longitudinal: multiple samples per subject

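Here each subject has several samples. The samples metadata carries `subject_id` and `timepoint` columns, so no separate timepoints file is needed (`timepoints_file = NULL`).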

```{r}
# Same matrix shapes as the cross-sectional example, but with one column per
# sample_id (two timepoints for each of two subjects).
exist_lg_path   <- file.path(tmp, "exist_long.csv")
input_lg_path   <- file.path(tmp, "counts_input_long.csv")
hit_lg_path     <- file.path(tmp, "counts_hit_long.csv")
samples_lg_path <- file.path(tmp, "samples_long.csv")

# exist matrix
exist_lg_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1_t1 = c(1, 0),
  s1_t2 = c(1, 0),
  s2_t1 = c(0, 1),
  s2_t2 = c(0, 1)
)
write.csv(exist_lg_df, file = exist_lg_path, row.names = FALSE)

# raw counts: input
input_lg_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1_t1 = c(100, 80),
  s1_t2 = c(110, 90),
  s2_t1 = c(95, 130),
  s2_t2 = c(90, 125)
)
write.csv(input_lg_df, file = input_lg_path, row.names = FALSE)

# raw counts: hit
hit_lg_df <- data.frame(
  peptide_id = c("p1", "p2"),
  s1_t1 = c(6, 0),
  s1_t2 = c(7, 0),
  s2_t1 = c(0, 8),
  s2_t2 = c(0, 9)
)
write.csv(hit_lg_df, file = hit_lg_path, row.names = FALSE)

# sample metadata: each sample_id is tied to a subject_id and a timepoint
samples_lg_df <- data.frame(
  sample_id = c("s1_t1", "s1_t2", "s2_t1", "s2_t2"),
  subject_id = c("subj1", "subj1", "subj2", "subj2"),
  timepoint = c("T1", "T2", "T1", "T2"),
  age = c(34, 34, 58, 58),
  sex = c("F", "F", "M", "M")
)
write.csv(samples_lg_df, file = samples_lg_path, row.names = FALSE)

pd_lg <- convert_legacy(
  exist_file = exist_lg_path,
  input_file = input_lg_path,
  hit_file = hit_lg_path,
  samples_file = samples_lg_path,
  # subject_id/timepoint already in samples metadata
  timepoints_file = NULL,
  extra_cols = c("subject_id", "timepoint", "age", "sex"),
  peptide_library = FALSE,
  materialise_table = FALSE
)

# Show one row per subject/sample/timepoint/peptide in a stable order.
get_counts(pd_lg) |>
  distinct(
    subject_id, sample_id, timepoint, peptide_id,
    exist, input_count, hit_count
  ) |>
  arrange(subject_id, timepoint, peptide_id) |>
  collect()
```

# Key points

- Cross-sectional: `sample_id == subject_id`; no timepoints file needed.
- Longitudinal: map samples to subjects and visits, either with `timepoints_file` or, as in the example above, with `subject_id` and `timepoint` columns in the samples metadata.
- `convert_legacy()` accepts CSV or Parquet for each matrix.
- Keep columns consistent across files (same peptide_id set, matching sample_id columns).
- Use `peptide_library = TRUE` to attach annotations (skip here for speed).