Investigating the Sensitivity of Pooled Swab Sampling for Pathogen Early Detection

Simon Grimm; Will Bradshaw

Authors Simon Grimm & Will Bradshaw

Date July 1, 2024

Code

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import statsmodels.formula.api as smf
from IPython.display import Markdown
from tabulate import tabulate
from typing import List
from scipy.stats import gmean, linregress, norm
from collections import namedtuple
from collections import defaultdict

DEFAULT_SWAB_SAMPLE_SIZES = [100, 200, 400]
SMALL_SWAB_SAMPLE_SIZES = [50, 75]
ASYMPTOMATIC_SHARE = 0.35
TARGET_INCIDENCE_W = 0.01
DOUBLING_PERIOD_D = 3

DEBUG = False

WW_STUDY_TITLES = ["Rothman et al. 2021", "Crits-Christoph et al. 2021"]


def get_num_and_exp(x):
    scientific_x = f"{x:.1e}"
    number, exponent = scientific_x.split("e")[0], int(scientific_x.split("e")[1])
    return number, exponent


def logit(x):
    return np.log(x / (1 - x))


def logistic(x):
    return 1 / (1 + np.exp(-x))


def get_studies():
    df_op_lu = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/lu_op_ct_mgs.tsv", sep="\t", skiprows=1
    )  # Data obtained from Table S1.
    df_op_lu.rename(
        columns={"SCV-2 Relative Abundance": "scv2_ra", "Ct value": "scv2_ct"},
        inplace=True,
    )
    df_op_lu[["patient_status", "swab_type", "Study"]] = [
        "Inpatient",
        "op",
        "Lu et al. 2021",
    ]

    df_np_rodriguez = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/rodriguez_np_ct_mgs.csv", sep=";"
    )  # Data sent to us by authors.
    rodriguez_patient_status_dict = {
        "Hospit": "Inpatient",
        "Out_Patient": "Outpatient",
        "Intensive_Care": "ICU",
    }
    df_np_rodriguez["patient_status"] = df_np_rodriguez["Group"].replace(
        rodriguez_patient_status_dict
    )
    df_np_rodriguez["scv2_ra"] = (
        df_np_rodriguez["Reads_2019_CoV"] / df_np_rodriguez["Reads_Total"]
    )

    df_np_rodriguez.rename(columns={"CoV_Ct_number": "scv2_ct"}, inplace=True)
    df_np_rodriguez[["swab_type", "Study"]] = ["np", "Rodriguez et al. 2021"]

    df_np_babiker = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/babiker_np_ct_mgs.tsv",
        sep="\t",
        skiprows=1,
    )  # Data obtained from table S2
    df_np_babiker.rename(
        columns={
            "SARS-CoV-2 RT-PCR Ct": "scv2_ct",
            "SARS-CoV-2 RA": "scv2_ra",
            "Inpatient/ED vs. Outpatient": "patient_status",
        },
        inplace=True,
    )
    df_np_babiker["scv2_ct"] = (
        df_np_babiker["scv2_ct"].replace(",", ".", regex=True).astype(float)
    )
    df_np_babiker["patient_status"] = df_np_babiker["patient_status"].apply(
        lambda x: x if x in ["Inpatient", "Outpatient"] else "Unknown"
    )
    # The data uses . to represent missing data. Set this column to integers, while at the same time mapping missing data to NA.
    df_np_babiker["days_from_onset"] = (
        df_np_babiker["Day of Testing Relative to Symptom Onset"]
        .replace(".", "-1")
        .astype(int)
        .replace(-1, "NA")
    )
    df_np_babiker[["swab_type", "Study"]] = ["np", "Babiker et al. 2020"]

    df_np_mostafa = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/mostafa-np-ra-ct.tsv", sep="\t"
    )  # Data obtained from Table S2.
    mostafa_severity_dict = {
        1: "Required\nventilator",
        2: "ICU",
        3: "Inpatient",
        4: "Outpatient",
        0: "Unknown",
    }
    df_np_mostafa.rename(
        columns={
            "SARS-CoV-2 RT-PCR Ct value": "scv2_ct",
            "CosmosID Proportion Mapped to SARS-CoV-2": "scv2_ra",
        },
        inplace=True,
    )
    df_np_mostafa["Severity index"] = df_np_mostafa["Severity index"].replace("–", 0)
    df_np_mostafa["patient_status"] = (
        df_np_mostafa["Severity index"].astype(int).replace(mostafa_severity_dict)
    )
    # There is no information of why some patients only have "<7" as their days from onset. We set it to 3.5 (the average of 1-6 days.)
    df_np_mostafa["days_from_onset"] = df_np_mostafa["No. of days from onset"].replace(
        {"–": "NA", "<7": "3.5"}
    )
    # Drop samples unless we have both qPCR and MGS detection
    df_np_mostafa = df_np_mostafa[df_np_mostafa["COVID-19-positive"] == True]
    df_np_mostafa = df_np_mostafa[df_np_mostafa["scv2_ct"] != "–"]
    df_np_mostafa["scv2_ct"] = df_np_mostafa["scv2_ct"].astype(float)
    df_np_mostafa[["swab_type", "Study"]] = ["np", "Mostafa et al. 2020"]

    study_dfs = {
        "Lu et al. 2021": df_op_lu,
        "Babiker et al. 2020": df_np_babiker,
        "Mostafa et al. 2020": df_np_mostafa,
        "Rodriguez et al. 2021": df_np_rodriguez,
    }
    return study_dfs


def get_study_ras():
    studies = get_studies()
    study_ras = {}
    for title, df in studies.items():
        study_ras[title] = df["scv2_ra"].tolist()
    return study_ras


def get_composite_ras():
    study_ras = get_study_ras().values()
    composite_swab_ras = sum(study_ras, [])
    if DEBUG:
        print(len(composite_swab_ras))
    return composite_swab_ras


def get_adjusted_composite_ras():
    study_dfs = get_studies().values()
    composite_df = pd.concat(study_dfs)
    composite_df = composite_df[
        composite_df["patient_status"].isin(["Inpatient", "Outpatient"])
    ]
    zero_ras = composite_df[composite_df["scv2_ra"] == 0]["scv2_ra"].tolist()
    symptom_status_dfs = adjust_cts(composite_df)
    symptom_status_ras = defaultdict(list)
    for symptom_status in ["Asymptomatic", "Symptomatic"]:
        df = symptom_status_dfs[symptom_status]
        df = df[df["scv2_ra"] != 0]
        df = adjust_rel_abun(df)
        ras = df["adjusted_scv2_ra"].tolist() + zero_ras
        symptom_status_ras[symptom_status] = ras

    return symptom_status_ras


def get_study_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    study_ras = get_study_ras().values()
    study_titles = get_study_ras().keys()

    study_p2ra_df = get_swab_p2ra(study_titles, study_ras, swab_sample_sizes)
    return study_p2ra_df


def get_composite_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    composite_ras = get_composite_ras()
    composite_p2ra_df = get_swab_p2ra(
        ["Original Composite Data"], [composite_ras], swab_sample_sizes
    )
    return composite_p2ra_df


def get_adjusted_composite_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    symptom_status_ras = get_adjusted_composite_ras()

    asymptomatic_p2ra_df = get_swab_p2ra(
        ["Adjusted Composite Data"],
        [symptom_status_ras["Asymptomatic"]],
        swab_sample_sizes,
    )
    symptomatic_p2ra_df = get_swab_p2ra(
        ["Adjusted Composite Data"],
        [symptom_status_ras["Symptomatic"]],
        swab_sample_sizes,
    )

    asymptomatics = asymptomatic_p2ra_df.sample(
        frac=ASYMPTOMATIC_SHARE, random_state=42
    )
    symptomatics = symptomatic_p2ra_df.sample(
        frac=1 - ASYMPTOMATIC_SHARE, random_state=42
    )

    composite_adjusted_p2ra_df = pd.concat(
        [asymptomatics, symptomatics], ignore_index=True
    )
    return composite_adjusted_p2ra_df


def get_asymptomatic_factor():
    # https://doi.org/10.1371/journal.pone.0270694
    long_2020_asymptomatic_delta = 0  # "The initial CT values for 37 asymptomatic individuals and 37 symptomatic patients appeared similar" https://www.nature.com/articles/s41591-020-0965-6
    lee_2020_asymptomatic_delta = 0  # "There were no significant differences in CT values between asymptomatic and symptomatic (including presymptomatic) patients." 10.1001/jamainternmed.2020.3862
    yang_2023_asymptomatic_delta = 0.99  # Extracted from supplement figure 4D https://doi.org/10.1016/S2666-5247(23)00139-8

    hall_asymptomatic_ct_median = 29.9
    hall_symptomatic_ct_median = 21.8
    hall_asymptomatic_delta = hall_asymptomatic_ct_median - hall_symptomatic_ct_median
    ASYMPTOMATIC_ADJUSTMENT_FACTOR = (
        hall_asymptomatic_delta
        + long_2020_asymptomatic_delta
        + lee_2020_asymptomatic_delta
        + yang_2023_asymptomatic_delta
    ) / 4

    return ASYMPTOMATIC_ADJUSTMENT_FACTOR


def adjust_cts(df):
    ASYMPTOMATIC_ADJUSTMENT_FACTOR = get_asymptomatic_factor()

    np_data = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-06-18-np-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )
    np_means = np_data.mean()

    NP_ADJUSTMENT_FACTOR = np_means.mean()
    goodall_data = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/goodall-op-nasal-ct.tsv",
        sep="\t",
        skiprows=2,
        header=None,
    )
    OP_ADJUSTMENT_FACTOR = goodall_data[0].mean()

    df["adjusted_scv2_ct"] = df["scv2_ct"]
    # Subtract the adjustment factors from the CT values (NP_ADJUSTMENT_FACTOR is negative, so it increases the CT values)
    df.loc[df["swab_type"] == "np", "adjusted_scv2_ct"] -= NP_ADJUSTMENT_FACTOR
    df.loc[df["swab_type"] == "op", "adjusted_scv2_ct"] -= OP_ADJUSTMENT_FACTOR
    df_symptomatic = df.copy()
    df_asymptomatic = df.copy()
    df_asymptomatic["adjusted_scv2_ct"] += ASYMPTOMATIC_ADJUSTMENT_FACTOR

    symptom_status_dfs = {
        "Asymptomatic": df_asymptomatic,
        "Symptomatic": df_symptomatic,
    }

    return symptom_status_dfs


def adjust_rel_abun(composite_df):
    composite_df = composite_df.copy()
    composite_df.loc[:, "scv2_ra_logit"] = composite_df["scv2_ra"].apply(logit)

    slope, intercept, r_value, p_value, std_err = linregress(
        composite_df["scv2_ct"], composite_df["scv2_ra_logit"]
    )
    composite_df["adjusted_scv2_ra_logit"] = (
        intercept + slope * composite_df["adjusted_scv2_ct"]
    )
    residuals = composite_df["scv2_ra_logit"] - (
        intercept + slope * composite_df["scv2_ct"]
    )

    sigma_squared = np.var(residuals, ddof=2)

    sigma = np.sqrt(sigma_squared)

    noise = np.random.normal(loc=0, scale=sigma, size=len(composite_df))

    composite_df["adjusted_scv2_ra_logit_with_noise"] = (
        composite_df["adjusted_scv2_ra_logit"] + noise
    )
    composite_df["adjusted_scv2_ra"] = composite_df[
        "adjusted_scv2_ra_logit_with_noise"
    ].apply(logistic)
    return composite_df


def get_swab_p2ra(subset_titles, ra_lists, swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    df_swab = pd.DataFrame()
    for subset, ras in zip(subset_titles, ra_lists):
        samples = get_logit_normal_samples(ras)
        df = simulate_p2ra_many(samples, swab_sample_sizes, n_simulations=10000)
        df["Subset"] = subset
        df_swab = pd.concat(
            [
                df_swab,
                df.melt(
                    id_vars="Subset",
                    value_vars=swab_sample_sizes,
                    var_name="Sample Size",
                    value_name="Relative Abundance",
                ),
            ]
        )

    return df_swab


def get_logit_normal_samples(ras):
    ra_values = np.array(ras)
    zero_share = (ra_values == 0).mean()
    ra_values = ra_values[ra_values != 0]
    logit_ra_values = logit(ra_values)
    mean, std = np.mean(logit_ra_values), np.std(logit_ra_values)
    norm_dist = norm(loc=mean, scale=std)
    logit_samples = norm_dist.rvs(size=int(100000 * (1 - zero_share)))
    samples = logistic(logit_samples)
    samples = np.append(samples, np.zeros(int(100000 * zero_share)))
    np.random.shuffle(samples)
    return samples


def get_prevalence():
    days_in_week = 7
    growth_factor = 2 ** (1 / DOUBLING_PERIOD_D)
    weekly_growth = growth_factor**days_in_week
    weekly_incidence = 0.01
    prevalence = weekly_incidence * (weekly_growth) / (weekly_growth - 1)

    if DEBUG:
        # Validate that calculated prevalence gives correct weekly incidence
        # Prevalence on each past day in the previous week
        past_prevalence = [
            prevalence / growth_factor**x for x in range(days_in_week + 1)
        ]  # +1 to include prevalence on day 0
        # Incidence on each past day in the previous week
        past_incidence = -np.diff(past_prevalence)

        assert np.isclose(sum(past_incidence), weekly_incidence)
    return prevalence


def simulate_p2ra_many(ra_lists, sample_populations, n_simulations) -> pd.DataFrame:
    prevalence = get_prevalence()
    results = defaultdict(list)
    for sample_pop in sample_populations:
        for _ in range(n_simulations):
            results[sample_pop].append(simulate_p2ra(sample_pop, ra_lists, prevalence))
    for key, values in results.items():
        results[key] = sorted(values)
    df = pd.DataFrame(results)
    return df


def simulate_p2ra(sample_pop, ra_lists, prevalence):
    n_sick = np.random.poisson(sample_pop * prevalence)
    if n_sick == 0:
        return 0
    cumulative_ra_sick = 0
    for _ in range(n_sick):
        observation = np.random.choice(ra_lists)
        cumulative_ra_sick += observation
    individual_ra_sick = cumulative_ra_sick / n_sick
    relative_abundance = n_sick / sample_pop * individual_ra_sick
    return relative_abundance


def get_fig_2_dfs():
    df_swab = pd.concat([study_p2ra_df, composite_p2ra_df], ignore_index=True)

    df_ww = pd.DataFrame(
        {
            "Relative Abundance": rothman_p2ra_ras + crits_christoph_p2ra_ras,
            "Subset": ["Rothman et al. 2021"] * len(rothman_p2ra_ras)
            + ["Crits-Christoph et al. 2021"] * len(crits_christoph_p2ra_ras),
        }
    )
    return df_swab, df_ww


def get_fig_7_dfs():
    df_swab = pd.concat([composite_p2ra_df, adjusted_composite_p2ra_df])
    df_swab.reset_index(drop=True, inplace=True)
    df_ww = pd.DataFrame(
        {
            "Relative Abundance": rothman_p2ra_ras + crits_christoph_p2ra_ras,
            "Subset": ["Rothman et al. 2021"] * len(rothman_p2ra_ras)
            + ["Crits-Christoph et al. 2021"] * len(crits_christoph_p2ra_ras),
        }
    )

    return df_swab, df_ww


def get_ww_p2ra():
    df_wastewater = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-05-07-fits-subset.tsv", sep="\t"
    )  # Original fits are taken from p2ra-manuscript. Based on the v1 pipeline (https://github.com/naobservatory/mgs-pipeline). Subset to be small enough for Github with ../scripts/subset_fits_tsv.py.
    rothman_p2ra_ras = df_wastewater[
        (df_wastewater["study"] == "rothman")
        & (df_wastewater["location"] == "Overall")
        & (df_wastewater["pathogen"] == "sars_cov_2")
    ]["ra_at_1in100"].tolist()
    crits_christoph_p2ra_ras = df_wastewater[
        (df_wastewater["study"] == "crits_christoph")
        & (df_wastewater["location"] == "Overall")
        & (df_wastewater["pathogen"] == "sars_cov_2")
    ]["ra_at_1in100"].tolist()

    ww_p2ra = {
        "Rothman et al. 2021": rothman_p2ra_ras,
        "Crits-Christoph et al. 2021": crits_christoph_p2ra_ras,
    }
    return ww_p2ra


def plot_medians(ax, medians):
    patches = ax.collections
    for patch, median in zip(patches, medians):
        for path in patch.get_paths():
            vertices = path.vertices
            x_mid = median
            closest_x = min(vertices[:, 0], key=lambda x: abs(x - x_mid))
            y_upper = max(vertices[vertices[:, 0] == closest_x, 1])
            y_lower = min(vertices[vertices[:, 0] == closest_x, 1])
            ax.plot([x_mid, x_mid], [y_lower, y_upper], color="white", linewidth=1.5)


# Generate p2ra values one time before figures are created.
study_dfs = get_studies().values()
composite_p2ra_df = get_composite_p2ra()
adjusted_composite_p2ra_df = get_adjusted_composite_p2ra()
study_p2ra_df = get_study_p2ra()
ww_p2ra_dfs = get_ww_p2ra()
rothman_p2ra_ras = ww_p2ra_dfs["Rothman et al. 2021"]
crits_christoph_p2ra_ras = ww_p2ra_dfs["Crits-Christoph et al. 2021"]

ASYMPTOMATIC_ADJUSTMENT_FACTOR = get_asymptomatic_factor()
rounded_asymptomatic_factor = f"{ASYMPTOMATIC_ADJUSTMENT_FACTOR:.2f}"

asymptomatic_share_percentage = f"{int(ASYMPTOMATIC_SHARE * 100)}%"
symptomatic_share_percentage = f"{int((1 - ASYMPTOMATIC_SHARE) * 100)}%"

prevalence = get_prevalence()

Acknowledgements: Thanks to Mike McLaren and Jeff Kaufman for helpful feedback on this report. Jeff Kaufman performed an initial exploratory analysis that prompted the creation of this longer report. We also thank the authors¹ of the qPCR and metagenomic studies who shared their data upon request and answered our questions.

Summary

Respiratory swab sampling is central to infectious disease surveillance, but most swab sampling programs focus on targeted identification of specific pathogens. At the NAO, we’re interested in how such a program would perform for more agnostic detection of novel pathogens.
To evaluate this, we collected published paired qPCR/MGS data from four studies of SARS-CoV-2 patients. Using these datasets, we predict the relative abundance of a SARS-CoV-2-like pathogen in sequencing data from pooled respiratory swabs, collected in public places at 1% weekly incidence. Based on our modeling we find the following:
- A composite of four studies gives a mean and median relative abundance of 3.3 \times 10^-4 and 2.2 \times 10^-6 in a sample of 100 swabs at 1% weekly incidence. Median relative abundance rises to 1.6 \times 10^-5 in a sample of 200 swabs. The data this model is based on was adjusted to reflect differences between study patient groups and the expected population of our proposed sampling program.
- Comparing median relative abundance to municipal wastewater sequencing, pooled swab sequencing is 20-, 160-, and 580-fold more sensitive per-read when collecting and sequencing 100, 200, and 400 swabs, respectively. Improvements are larger at the mean, with all pool sizes showing a 1800-fold improvement over wastewater sequencing.
- Small pools frequently contain no infected individuals (400 swabs ≈ 1% no detect, 200 swabs ≈ 10%, 100 swabs ≈ 32%²). The increased median sensitivity of higher pool sizes is thus mostly mediated through a reduction in zero-abundance results, whereas mean results stay the same.
The sensitivity of pooled swab sequencing appears superior to municipal wastewater.
To further investigate the potential value of swab sampling and sequencing for pathogen early warning, the NAO is now running a pilot surveillance program in the Boston area. In this program, we are collecting, pooling, and sequencing self-administered respiratory samples to search for novel pathogens using genetic engineering detection.

Introduction

Respiratory pathogens must pass through the pharynx, mouth, and nose during transmission and as such, are often found in high concentrations in anterior nasal, oropharyngeal, and nasopharyngeal swabs. As a result, we hypothesized that respiratory viruses show higher relative abundance (the fraction of all sequencing reads assigned to that pathogen) in respiratory swab samples than in other samples we’ve looked at previously, including wastewater and air samples. High relative abundance means less sequencing depth required for detection, which decreases sequencing costs. Combined with the relatively low regulatory hurdles associated with swabbing individuals in non-clinical settings, this would make pooled swab sampling a promising approach for detecting novel respiratory pathogens.

Non-clinical swab sampling programs have precedent. One such program is CDC’s Traveler-Based Genomic Surveillance (TGS) program which collects swabs from international travelers entering the US. CDC TGS has traditionally focused on specific pathogens, detected via quantitative polymerase chain reaction (qPCR) and amplicon sequencing. In this report, we evaluate the performance of a more pathogen-agnostic detection method: metagenomic sequencing (MGS) of respiratory swabs from the general public. To do this, we use existing sequencing data from swabs to estimate the expected relative abundance of a SARS-CoV-2-like pathogen under specific epidemiological assumptions. Specifically, we model a scenario where,

A SARS-CoV-2 like-pathogen is at 1% weekly incidence³, growing exponentially with a doubling time of 3 days (wild-type COVID-19)⁴.
A sampling program collects swabs from the population in public places, like train stations, public parks, or sidewalks.
Samples are pooled and sequenced using untargeted metagenomic sequencing (MGS).

In our modeling, we use data from four different studies that performed MGS of swabs from individuals infected with SARS-CoV-2. Subsequently, we make some simple adjustments to try to account for differences between the individuals in these datasets and those who would be likely to contribute samples in non-medical settings. The results suggest pooled swab sampling and sequencing could be a viable approach for MGS-based early detection of SARS-CoV-2-like novel pathogens, with one to two orders of magnitude better sensitivity than a wastewater-based program.

Estimating sampling sensitivity with SARS-CoV-2 sequencing data

In clinical settings, untargeted MGS is typically only used as a last resort for severe undiagnosed infections. Nevertheless, there are many studies showing that MGS can detect pathogens in swab samples with decent sensitivity, among them SARS-CoV-2, influenza, and other pathogens including RSV, rhinovirus, adenovirus, and metapneumovirus. For the purposes of this analysis, we focus on studies that performed untargeted MGS on COVID-19 patients. We only use studies that performed qPCR alongside MGS, which gives us a mapping between targeted and untargeted quantification metrics that we can use for later adjustments (see Appendix)⁵. We found four such studies for which sufficient data were available:

Babiker et al. 2020 analyzed nasopharyngeal (NP) samples from 10 outpatients and 35 inpatients who were SARS-CoV-2 qPCR-positive⁶. Samples were collected in Atlanta (Georgia, USA); the median (IQR) time between symptom onset and sample collection was 5 (4 to 7) days.
Mostafa et al. 2020, sampled NP swabs from 50 patients, 26 of which tested positive for COVID-19 in both qPCR and MGS. Sampling took place at Johns Hopkins Hospital in Baltimore (Maryland, USA). Out of these 26, 17 have information on days between symptom onset and sampling, with a mean of 5.3 days.
Lu et al. 2021 analyzed oropharyngeal (OP) swab samples from inpatients in Wuhan (China). The paper provides limited information on patient disease severity or other characteristics, stating only that “Respiratory specimens (swabs) [were] collected from patients admitted to various Wuhan health care facilities.” There is no information on the time between symptom onset and the sampling date.
Rodriguez et al. 2021 collected NP swabs from 42 outpatients, 17 inpatients not requiring intensive care and 45 intensive care unit (ICU) patients. All patients were sampled at a hospital in Paris (France). There is no information on the time between symptom onset and the sampling date.

SARS-CoV-2 relative abundance, as reported by the authors of each study, varied widely both within and between studies (Figure 1); in particular, Lu et al. 2021 and Rodriguez et al. 2021 showed substantially higher relative abundances than Babiker and Mostafa. The greatest range of relative abundances is seen in Rodriguez et al. 2021, with values spanning close to 7 orders of magnitude. These differences could arise from differences in swab type, sample processing, disease severity, or other factors; crucially, each study performed its own computational analysis to arrive at SARS-CoV-2 read counts, which might add additional inter-study variability.

Code

study_dfs = get_studies()

fig, axs = plt.subplots(2, 2, figsize=(10, 8))

df_np_babiker = study_dfs["Babiker et al. 2020"]
df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
df_op_lu = study_dfs["Lu et al. 2021"]
df_np_mostafa = study_dfs["Mostafa et al. 2020"]

if DEBUG:
    for study_df in df_np_babiker, df_np_rodriguez, df_op_lu, df_np_mostafa:
        print(
            study_df["Study"].unique(),
            f"Relative Abundance: {gmean(study_df['scv2_ra'])}",
            f"qPCR CT: {gmean(study_df['scv2_ct'])}",
        )

composite_df = pd.concat([df_op_lu, df_np_babiker, df_np_mostafa, df_np_rodriguez])
viridis_palette = sns.color_palette("viridis", n_colors=composite_df["Study"].nunique())

sns.histplot(
    ax=axs[0, 0],
    data=composite_df,
    x="scv2_ra",
    hue="Study",
    palette=viridis_palette,
    bins=30,
    log_scale=True,
    element="bars",
    linewidth=0,
    multiple="stack",
    legend=False,
)

axs[0, 0].set_title("a", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[0, 0].set_ylabel("Count")
axs[0, 0].set_xlabel("Relative Abundance")
axs[0, 0].tick_params(axis="x", which="minor", bottom=False)

sns_default = sns.color_palette(n_colors=composite_df["patient_status"].nunique())
sns.histplot(
    ax=axs[0, 1],
    data=composite_df,
    x="scv2_ra",
    hue="patient_status",
    palette=sns_default,
    bins=30,
    log_scale=True,
    element="bars",
    linewidth=0,
    multiple="stack",
    legend=False,
)
axs[0, 1].set_title("b", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[0, 1].set_ylabel("Count")
axs[0, 1].set_xlabel("Relative Abundance")
axs[0, 1].tick_params(axis="x", which="minor", bottom=False)

sns.scatterplot(
    ax=axs[1, 0],
    data=composite_df,
    x="scv2_ra",
    y="scv2_ct",
    hue="Study",
    style="Study",
    palette=viridis_palette,
)
axs[1, 0].set_title("c", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[1, 0].set_xlabel("Relative Abundance")
axs[1, 0].set_ylabel("qPCR cycle threshold value")
axs[1, 0].set_xscale("log")

axs[1, 0].tick_params(axis="x", which="minor", bottom=False)

sns.move_legend(
    axs[1, 0],
    "lower center",
    bbox_to_anchor=(0.5, -0.44),
    ncol=2,
    title=None,
    frameon=False,
    fontsize=11,
    markerscale=2,
)

sns.scatterplot(
    ax=axs[1, 1],
    data=composite_df,
    x="scv2_ra",
    y="scv2_ct",
    hue="patient_status",
    style="patient_status",
    palette=sns_default,
)
axs[1, 1].set_title("d", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[1, 1].set_xlabel("Relative Abundance")
axs[1, 1].set_ylabel("qPCR cycle threshold value")
axs[1, 1].set_xscale("log")

axs[1, 1].tick_params(axis="x", which="minor", bottom=False)

sns.move_legend(
    axs[1, 1],
    "lower center",
    bbox_to_anchor=(0.5, -0.54),
    ncol=2,
    title=None,
    frameon=False,
    fontsize=11,
    markerscale=1.7,
)

for ax in axs.flat:
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
plt.tight_layout()

Figure 1: SARS-CoV-2 qPCR CT⁷ vs SARS-CoV-2 relative abundance **(a, c)** Data across studies **(b,d)** Data across patient types. Zero relative abundance values are not shown but account for 13 samples in Mostafa et al. 2020 (33.3%) and 1 sample in Rodriguez et al. 2021 (0.96%).

These data can be used to estimate the expected relative abundance observed at a particular disease incidence in the population, an analysis we previously performed for wastewater. To generate these estimates, we simulated swabbing individuals from the population at random, with pools of swabs sequenced collectively. The number of sick people in each sample is Poisson-distributed. SARS-CoV-2 relative abundance was calculated as the average across individuals in the pool, with the relative abundance of uninfected individuals assumed to be 0. The relative abundance of infected individuals also had some chance of being 0, based on the number of individuals in the data with a positive qPCR result but no SARS-CoV-2 detected in sequencing data; other infected individuals had an RA drawn from a logit-normal distribution fitted to the data⁸. (see logit distributions in Figure 8).

Figure 2 displays the distribution of relative abundances from a simulation where SARS-CoV-2 incidence is 1% per week, with swabs pooled in batches of 100, 200, or 400⁹. For a pool size of 100 swabs, the median SARS-CoV-2 relative abundance ranges from 2.4 \times 10^–7 (Mostafa et al. 2020) to 2.4 \times 10^-5 (Lu et al. 2021). When combining data from all four studies, the median relative abundance is 2.7 \times 10^-6 for 100 swabs. Median detection sensitivity improves by nearly one order of magnitude when increasing the pool size to 200 swabs, with a relative abundance of 2.0 \times 10^-5. More detailed results can be found in Table 3. Smaller pools frequently contain no SARS-CoV-2 fragments (~32% for pools with 100 swabs, ~10% for 200 swabs, and ~1% for 400 swabs, Figure 9). The number of swabs significantly affects the results: medians for 100 and 200 swab pools are notably lower than the 400 swab median due to zero-abundance pools (see median indicators in Figure 2). Mean results for both 100, 200, and 400 swabs are similar with a relative abundance of 3.4 \times 10^-4.

In comparison to the above results, the equivalent median estimate for SARS-CoV-2 in wastewater is approximately 7.4 \times 10^-8 in Rothman et al. (2021) and 1.4 \times 10^-7 in Crits-Christoph et al. (2021). Averaging across the two wastewater studies, this gives an overall prediction of a 25-, 200, and 670 -fold difference in median SARS-CoV-2 relative abundance between municipal wastewater and community swabbing for 100, 200 and 400 swabs, respectively. Looking at mean relative abundance instead of the median, the improvement in per-read sensitivity across all swab pool sizes (100, 200, and 400) is 2000-fold.

Code

def get_fig_2():
    df_swab, df_ww = get_fig_2_dfs()

    if DEBUG:
        for subset in df_swab["Subset"].unique():
            for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
                subset_df = df_swab.query(
                    "Subset == @subset and `Sample Size` == @swab_sample_size"
                )
                median = subset_df["Relative Abundance"].median()
                zero_count_fraction = (subset_df["Relative Abundance"] == 0).mean()
                print(
                    f"Subset: {subset}, Sample Size: {swab_sample_size}, Fraction of Zero Counts: {zero_count_fraction}, Median: {median}"
                )

    df_ww["Relative Abundance"] = np.log10(df_ww["Relative Abundance"])
    with np.errstate(divide="ignore"):
        df_swab["Relative Abundance"] = np.log10(df_swab["Relative Abundance"])
    fig, axs = plt.subplots(
        2, 1, figsize=(10, 7.5), gridspec_kw={"height_ratios": [3, 1]}
    )
    fig.subplots_adjust(hspace=0.4)
    colors = sns.color_palette("viridis", len(DEFAULT_SWAB_SAMPLE_SIZES))

    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[0],
        hue="Sample Size",
        palette=colors,
        data=df_swab,
        linewidth=0,
        bw_adjust=1.5,
        dodge=0.4,
        cut=0.5,
    )

    swab_medians_logged = []

    for subset in df_swab["Subset"].unique():
        for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
            subset_df = df_swab.query(
                "Subset == @subset and `Sample Size` == @swab_sample_size"
            )
            median_logged = subset_df["Relative Abundance"].median()
            swab_medians_logged.append(median_logged)

    plot_medians(axs[0], swab_medians_logged)

    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[1],
        color="#aedc31",
        data=df_ww,
        inner=None,
        linewidth=0,
        bw_adjust=1.5,
        width=0.5,
        dodge=0.2,
    )

    ww_medians_logged = []
    ww_medians = []
    for study in df_ww["Subset"].unique():
        median = np.median(df_ww[(df_ww["Subset"] == study)]["Relative Abundance"])

        ww_medians_logged.append(median)
        ww_medians.append(10**median)
    if DEBUG:
        print(ww_medians)
    plot_medians(axs[1], ww_medians_logged)

    axs[0].set_title(
        "a (Swabs)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="heavy"
    )
    axs[1].set_title(
        "b (Wastewater)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="bold"
    )

    for ax in axs:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.set_ylabel("")
        ax.set_yticks(ax.get_yticks())
        y_labels = [
            (
                label.get_text().rsplit(" ", 1)[0]
                + "\n"
                + label.get_text().rsplit(" ", 1)[1]
                if " " in label.get_text()
                else label.get_text()
            )
            for label in ax.get_yticklabels()
        ]
        ax.set_yticklabels(y_labels)

        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    xmin, xmax = -10, 0
    axs[0].set_xlim(xmin, xmax)
    axs[1].set_xlim(xmin, xmax)

    for x in np.arange(math.ceil(xmin), 1, 1):
        axs[0].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
        axs[1].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )

    current_xticks = axs[0].get_xticks()
    for ax in axs:
        ax.set_xticks(current_xticks)
        ax.set_xticklabels(
            [
                "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
                for value in current_xticks
            ]
        )
    legend = axs[0].legend(
        title="Number of\nswabs",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        ncol=1,
        frameon=False,
    )
    legend._legend_box.align = "left"

    plt.tight_layout()


get_fig_2()

Figure 2: Expected relative abundance at 1% SARS-CoV-2 incidence for **(a)** pooled respiratory swabs and **(b)** wastewater sequencing (data from Grimm et al. 2024). Violins indicate probability density only over simulations for which relative abundance was >0%; zero-abundance results aren’t plotted but can make up a substantial fraction of results (Figure 9). Medians (indicated by white bars) incorporate these zero-abundance results.

Adjusting for confounders

At first sight, the results presented in Figure 2 seem like an argument in favor of pooled swab sampling over wastewater sampling. However, the patients being sampled in these four studies are not equivalent to the random members of the public we’d expect to sample. Three particularly relevant differences are:

Swab type: the four MGS studies all used either NP swabs (Babiker et al. 2020, Mostafa et al. 2020, and Rodriguez et al. 2021), or OP swabs (Lu et al. 2021). Conversely, we expect a public-targeted sampling program would most likely use anterior nasal (henceforth simply “nasal”) or combined nasal/OP swabs. NP swabs, in particular, are poorly suited to mass sampling, as they are quite uncomfortable and require a trained professional to perform (and are thus unsuited to self-swabbing).
Disease severity: Most patients in the four MGS studies were hospitalized, but a public-targeted sampling program would primarily sample from individuals who were asymptomatic, pre-symptomatic, or only mildly ill.
Disease onset: Most of the inpatients in the four MGS studies were sampled later in the course of their infection, but viral loads for SARS-CoV-2 (and many other diseases) are highest around symptom onset and decline over time. In public-facing sampling, we’d expect to sample infected people closer to the start of their infection than in hospitals.

In Appendix 1 we investigate each of these differences in detail and try to estimate the likely effect on the measured relative abundance at a given disease incidence. In summary we find the following:

NP swab samples are expected to be more sensitive than nasal swabs. To approximate nasal swabs NP swab samples thus get a +1.43 CT adjustment (becoming less sensitive), based on data from five different studies (Figure 4). Conversely, OP swabs are less sensitive than nasal swabs. Based on data from two studies (Figure 5), OP swab samples receive a -0.92 CT adjustment (becoming more sensitive).
Asymptomatics generally show lower viral loads. Based on data from four studies, symptomatic patients who are adjusted to resemble asymptomatics thus receive a +2.27 CT adjustment.
Samples from ICU patients and patients on ventilators are expected to differ from mildly-ill or asymptomatic cases in ways that are hard to just adjust for. We thus drop ICU and ventilator samples from our analysis.
Comparisons of inpatient and outpatient samples are frequently confounded by disease onset. We thus do not adjust for inpatient and outpatient status.
Finally, though we created estimates on how CT values change as the time between disease onset and sampling increases (+0.4 CT units per day), Lu et al. 2021 and Rodriguez et al. 2021 have limited or no data on the time between symptom onset and sampling date. Therefore, we do not adjust for this confounder.

Modeling the sensitivity of a pooled swab sampling program

Based on these findings, we re-estimate the expected SARS-CoV-2 relative abundance at 1% weekly incidence in the following way:

We create a composite dataset by combining samples from all studies, dropping samples from ICU patients, and individuals on ventilators, and those with unknown patient status.
We apply a linear regression fitted to the composite qPCR data and logit-scale relative-abundance data, generating a relationship between qPCR and sequencing results.
We adjust CT values to represent nasal swabs by applying a +1.43 CT adjustment to NP swab samples and a -0.92 CT adjustment to OP samples.
We create separate asymptomatic and symptomatic versions of the dataset, applying a further +2.27 CT to the former while leaving the latter unchanged.
Separately for the asymptomatic and symptomatic distributions, we convert the adjusted CT values into adjusted SARS-CoV-2 relative abundances using the linear regression from step 2. We then fit a logit-normal distribution on the adjusted, non-zero RAs as above. Finally, we add the same relative share of zero RAs to the distribution as was observed in the original relative abundance data (resulting in a zero-inflated logit-normal distribution)
We conduct Monte Carlo simulations of pooled swab samples, in which the overall relative abundance of the pool is the average of each swab, and the relative abundance of each swab in the pool is calculated as one of three states:
1. Non-infected: relative abundance (RA) = 0.
2. Infected, asymptomatic: RA drawn from adjusted RA distribution for asymptomatic cases.
3. Infected, symptomatic: RA drawn from adjusted RA distribution for symptomatic cases¹⁰.

The results are visualized in Figure 3 and summarized in Table 1. Comparing the 100 swab sample size results to the wastewater data, swab samples are 30-fold more sensitive than Rothman et al. 2021 (7.4\times 10^-8) and 15-fold more sensitive than Crits-Christoph et al. 2021 (1.4\times 10^-7). Using swab sample pools of 200, swab sampling sensitivity is 225- and 120-fold more sensitive than Rothman et al. 2021 and Crits-Christoph et al. 2021 respectively. Averaging across the two wastewater studies, this gives an overall prediction of a 20- and 160-fold difference in median SARS-CoV-2 relative abundance between municipal wastewater and community swabbing for 100 and 200 swabs, respectively. Mean results across all pool sizes give a 1800-fold improvement over mean wastewater relative abundance.

Code

def get_fig_7():
    df_swab, df_ww = get_fig_7_dfs()

    with np.errstate(divide="ignore"):
        df_ww["Relative Abundance"] = np.log10(df_ww["Relative Abundance"])
        df_swab["Relative Abundance"] = np.log10(df_swab["Relative Abundance"])

    fig, axs = plt.subplots(
        2,
        1,
        figsize=(10, 5),
        gridspec_kw={"height_ratios": [1.5, 1]},
    )
    fig.subplots_adjust(hspace=0.4)
    colors = sns.color_palette("viridis", 3)
    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[0],
        hue="Sample Size",
        palette=colors,
        data=df_swab,
        linewidth=0,
        bw_adjust=1.5,
        dodge=0.4,
        cut=1,
    )
    swab_medians_logged = []
    for subset in df_swab["Subset"].unique():
        for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
            median = np.median(
                df_swab[
                    (df_swab["Subset"] == subset)
                    & (df_swab["Sample Size"] == swab_sample_size)
                ]["Relative Abundance"]
            )
            swab_medians_logged.append(median)

    plot_medians(axs[0], swab_medians_logged)
    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[1],
        color="#aedc31",
        data=df_ww,
        linewidth=0,
        bw_adjust=1.5,
        width=0.5,
        dodge=0.4,
    )
    ww_medians_logged = []
    ww_medians = []
    for study in df_ww["Subset"].unique():
        median = np.median(df_ww[(df_ww["Subset"] == study)]["Relative Abundance"])

        ww_medians_logged.append(median)
        ww_medians.append(10**median)
    plot_medians(axs[1], ww_medians_logged)

    axs[0].set_title(
        "a (Swabs)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="heavy"
    )
    axs[1].set_title(
        "b (Wastewater)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="bold"
    )
    for ax in axs:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.set_ylabel("")
        if ax == axs[1]:
            y_labels = [
                (
                    label.get_text().rsplit(" ", 1)[0]
                    + "\n"
                    + label.get_text().rsplit(" ", 1)[1]
                    if " " in label.get_text()
                    else label.get_text()
                )
                for label in ax.get_yticklabels()
            ]
            ax.set_yticks(ax.get_yticks())
            ax.set_yticklabels(y_labels)

        if ax == axs[0]:
            y_labels = [
                (
                    label.get_text().rsplit(" ", 1)[0]
                    + "\n"
                    + label.get_text().rsplit(" ", 1)[1]
                )
                for label in ax.get_yticklabels()
            ]
            ax.set_yticks(ax.get_yticks())
            ax.set_yticklabels(y_labels)
        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    xmin, xmax = -10, 0
    for x in np.arange(math.ceil(xmin), 1, 1):
        axs[0].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
        axs[1].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
    for ax in axs:
        ax.set_xlim(xmin, xmax)
        current_xticks = ax.get_xticks()
        ax.set_xticks(current_xticks)
        ax.set_xticklabels(
            [
                "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
                for value in current_xticks
            ]
        )
    legend = axs[0].legend(
        title="Number of\nswabs",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        ncol=1,
        frameon=False,
    )
    legend._legend_box.align = "left"
    plt.tight_layout()


get_fig_7()

Figure 3: Expected relative abundance at 1% SARS-CoV-2 incidence for **(a)** pooled respiratory swabs and **(b)** wastewater sequencing (data from Grimm et al. 2024).

Sample Type	Sample Size	Median	IQR	Mean	Zero Count Fraction	Fold difference vs Wastewater (median)
Original Composite Data	100	2.7e-06	6.9e-05	0.00038	0.31	25
Original Composite Data	200	2e-05	0.00019	0.00034	0.09	200
Original Composite Data	400	6.8e-05	0.00033	0.00034	0.01	670
Adjusted Composite Data	100	2.2e-06	5.9e-05	0.00033	0.31	20
Adjusted Composite Data	200	1.6e-05	0.00016	0.00031	0.1	160
Adjusted Composite Data	400	5.9e-05	0.00029	0.00032	0.01	580

Table 1: Comparison of swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence (Wastewater relative abundance was averaged across Rothman et al. 2021 and Crits-Christoph et al. 2021)

Conclusion

In this report, we evaluate the expected sensitivity of pooled swab sampling and untargeted metagenomic sequencing for detecting a SARS-CoV-2-like pathogen. By analyzing MGS data from COVID-19 studies and adjusting for swab type and symptom status, we estimated the relative abundance of SARS-CoV-2 reads that could be obtained at a 1% weekly incidence. Our results suggest that this approach may offer one to three orders of magnitude greater sensitivity compared to municipal wastewater, depending on the swab sample pool size and the measure of comparison (mean or median).

This investigation has several obvious limitations. Most glaringly, we only used data on SARS-CoV-2, making extrapolation to other diseases (let alone hypothetical future ones) difficult. The adjustments made to account for confounding factors are fairly simplistic, and the results might change after more detailed (and time-consuming) analysis. Finally, even having made these adjustments, our relative abundance estimates span many orders of magnitude, leaving us with a high degree of uncertainty about the true efficacy of swab sampling for MGS-based detection of a SARS-CoV-2-like pathogen.

Our analysis reveals pool size as a critical variable influencing median results, with larger pools significantly decreasing the likelihood of zero-abundance samples. Moreover, the performance of pooled swab sampling in our simulations is highly sensitive to key epidemiological factors, such as pathogen doubling time, which affects both mean and median results. Internal analyses suggest that a slow-growing pathogen with an extended shedding period would yield higher sensitivity than the figures presented in this study.¹¹

Based on our results, we are cautiously optimistic about the performance of pooled swab sampling for metagenomic biosurveillance, especially compared to municipal wastewater. Averaging across the two wastewater studies considered here gives a fold-difference in median relative abundance (at 1% weekly COVID-19 incidence) of 20 and 160 for the adjusted 100- and 200-swab estimates, and a 2000-fold improvement in mean relative abundance. This increase in relative abundance would bring a concomitant decrease in the sequencing depth required for detection.

More research is required to confirm the viability of such a program in practice. In particular, both improving the performance of wet-lab protocols and streamlining swab sampling will be required to create additional savings. But our results suggest that swab sampling in non-hospital settings could plausibly be a sensitive way to detect novel pathogens. CDC’s TGS program (see above) demonstrates the logistical feasibility of large-scale swab sampling, collecting samples from almost 50,000 individuals in just over a year. Incorporating metagenomic sequencing into such a program in a cost-effective way might help identify and mitigate the spread of novel biological threats.

Appendix 1: Confounders

Swab type adjustment

Tsang et al. 2021 performed a meta-analysis to evaluate how well different swab types detect SARS-CoV-2 using qPCR. Swab types included nasal swabs (n=1622), OP swabs (n=388), and pooled nasal and throat swabs (n=719), each of which were compared with NP swabs¹². Using this approach, pooled nasal/throat swabs have the best diagnostic performance with a sensitivity of 0.97 (0.93-1.00), while nasal swabs performed worse at 0.87 (0.80–0.93)

Sample Type	Sensitivity	Negative Predictive Value
Nasal swabs (n=1622)	0.87 (0.80–0.93)	0.95 (0.88–0.99)
Throat swabs (n=388)	0.75 (0.52–0.92)	0.96 (0.94–0.98)
Pooled nasal/throat swabs (n=719)	0.97 (0.93–1.00)	0.99 (0.98–1.00)
NP swab (n=7973, includes comparison with saliva)	0.94 (0.91–0.97)	0.99 (0.98–1.00)

Table 2: Comparison of different swab sample types. Adapted from Tsang et al. 2021

This study gives us binary information about a pathogen’s presence in different swab types. However, to quantify the difference between swab types, we need quantitative information about the average viral load of SARS-CoV-2 in each swab type. One way to quantify this difference is by calculating the difference in cycle threshold (CT) values obtained when running qPCR on paired swabs sampled from the same person, with a lower CT indicating a higher viral load.

To compare NP to nasal swabs, we examined five studies that performed paired sampling and SARS-CoV-2 qPCR on the same patients using both swab types: Patriquin et al. 2022, McCulloch et al. 2020, Pere et al. 2020, Kojima et al. 2020, and Tu et al. 2020 (Figure 4). The mean differences in CT values (NP – nasal) for the five studies were -3.05, -1.55, 0.97, -3.91 and 0.38, respectively. The unweighted average of these five values is -1.43, indicating that NP swabs are typically more sensitive than nasal swabs.

Code

def get_fig_3():
    df = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-06-18-np-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )

    df = df.melt(var_name="Study", value_name="CT Difference")

    pretty_study_names = {
        "Patriquin2022": "Patriquin et al. 2022",
        "McCulloch2020": "McCulloch et al. 2020",
        "Pere2020": "Pere et al. 2020",
        "Kojima2020": "Kojima et al. 2020",
        "Tu2020": "Tu et al. 2020",
    }

    df["Study"] = df["Study"].map(pretty_study_names).values

    mean_ct_diff = df.groupby("Study", as_index=False)["CT Difference"].mean()
    if DEBUG:
        print(mean_ct_diff)
        print(f"Mean CT Difference: {mean_ct_diff['CT Difference'].mean()}")
    fig = plt.figure(figsize=(10, 3.8))

    colors = sns.color_palette("viridis", len(df["Study"].unique()))

    sns.stripplot(
        data=df,
        y="Study",
        x="CT Difference",
        hue="Study",
        jitter=True,
        zorder=-1,
        palette=colors,
        alpha=0.5,
    )
    sns.pointplot(
        data=mean_ct_diff,
        y="Study",
        x="CT Difference",
        linestyles="none",
        markers="d",
        color="#36454F",
        markersize=7,
        errorbar=None,
        zorder=1,
    )
    plt.legend([], [], frameon=False)
    plt.ylabel("")
    plt.xlabel("SARS-CoV-2 qPCR CT Δ (NP - Nasal)")
    plt.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    x_min, x_max = plt.xlim()

    plt.text(x_max / 2, -0.6, "Favors Nasal", fontsize=10, color="black", ha="center")
    plt.text(
        x_min / 2,
        -0.6,
        "Favors Nasopharyngeal",
        fontsize=10,
        color="black",
        ha="center",
    )
    x_min = math.ceil(x_min / 5) * 5
    x_max = math.ceil(x_max / 5) * 5

    x_marks = np.arange(x_min, x_max, 2.5)
    for x in x_marks:
        if x == 0:
            continue
        plt.axvline(
            x=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-2
        )

    plt.axvline(x=0, color="red", linestyle="--", alpha=0.5, zorder=-2)

    plt.gca().spines["right"].set_visible(False)
    plt.gca().spines["top"].set_visible(False)
    plt.gca().spines["left"].set_visible(False)
    plt.tight_layout()


get_fig_3()

Figure 4: Nasal swabs vs NP swabs. Data from Patriquin et al. 2022, McCulloch et al. 2020, Pere et al. 2020, Kojima et al. 2020, and Tu et al. 2020. Patriquin et al. 2022 and Pere et al. 2020 performed professional collection of both NP and nasal swabs; other studies performed professional collection of NP swabs but had patients collect nasal swabs themselves. Diamonds indicate mean values.

Similarly, to compare OP to nasal swabs, we examined two comparative studies: Berenger et al. 2020 and Goodall et al. 2022. Berenger et al. 2020 only give median, 10th, and 90th percentile values; assuming for the sake of tractability that these are drawn from a normal distribution, we get a difference between OP and nasal swabs of 0.89 (OP = 29.94, Nasal = 29.05). For Goodall, the mean difference is 0.99 (Figure 5). This yields an unweighted average of 0.92, indicating that OP swabs are less sensitive than nasal swabs.

Code

def get_fig_4():
    df = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/goodall-op-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )
    pretty_study_names = {
        "Goodall2022": "Goodall et al. 2022",
    }
    df = df.melt(var_name="Study", value_name="CT Difference")
    df["Study"] = df["Study"].map(pretty_study_names).values
    mean_ct_diff = df.groupby("Study", as_index=False)["CT Difference"].mean()
    if DEBUG:
        print(mean_ct_diff)
        print(f"Mean CT Difference: {mean_ct_diff}")
    fig, ax = plt.subplots(figsize=(10, 2))

    viridis = sns.color_palette("viridis", 1)
    sns.stripplot(
        data=df,
        y="Study",
        x="CT Difference",
        hue="Study",
        jitter=True,
        zorder=-1,
        ax=ax,
        alpha=0.8,
        palette=viridis,
    )
    sns.pointplot(
        data=mean_ct_diff,
        y="Study",
        x="CT Difference",
        linestyles="none",
        markers="d",
        color="#36454F",
        markersize=8,
        errorbar=None,
        zorder=1,
        ax=ax,
    )
    ax.legend([], [], frameon=False)

    ax.set_ylabel("")
    ax.set_xlabel("SARS-CoV-2 qPCR CT Δ (OP - Nasal)")
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    x_min, x_max = ax.get_xlim()

    ax.text(x_max / 2, -0.6, "Favors Nasal", fontsize=10, color="black", ha="center")
    ax.text(
        x_min / 2, -0.6, "Favors Oropharyngeal", fontsize=10, color="black", ha="center"
    )

    x_min = math.ceil(x_min / 5) * 5
    x_max = math.ceil(x_max / 5) * 5

    x_marks = np.arange(x_min, x_max, 2.5)
    for x in x_marks:
        if x == 0:
            continue
        ax.axvline(
            x=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-2
        )
    ax.axvline(x=0, color="red", linestyle="--", alpha=0.5, zorder=-2)

    ax.axhline(y=0.5, color="grey", linestyle="--", alpha=0.5, linewidth=0.5)

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["left"].set_visible(False)
    plt.tight_layout()
    plt.show()


get_fig_4()

# Computing median of Berenger et al. 2020:


# Source: https://www.medrxiv.org/content/10.1101/2020.05.05.20084889v1.full#:~:text=Comparing%20the%20samples,respectively%20(p%3E0.09).
data = {
    "E_NP": {"median": 25.5, "10th": 20.5, "90th": 29.5},
    "E_Nasal": {"median": 27.6, "10th": 24.7, "90th": 32.4},
    "E_Throat": {"median": 28.7, "10th": 23.5, "90th": 34.2},
    "RdRp_NP": {"median": 27.9, "10th": 23.5, "90th": 32.4},
    "RdRp_Nasal": {"median": 30.5, "10th": 27.5, "90th": 35.0},
    "RdRp_Throat": {"median": 31.3, "10th": 26.5, "90th": 35.5},
}


def calculate_mean_std(median, p10, p90):
    mean = median
    std = (p90 - p10) / (2 * norm.ppf(0.9))
    norm_dist = norm(mean, std)
    arrived_p10 = norm_dist.ppf(0.1)
    arrived_p90 = norm_dist.ppf(0.9)
    if abs(p10 - arrived_p10) > 1 or abs(p90 - arrived_p90) > 1:
        print(
            f"Warning: p10 or p90 values are not within one point of the arrived percentiles. p10: {p10}, arrived_p10: {arrived_p10}, p90: {p90}, arrived_p90: {arrived_p90}"
        )
    return mean, std


x = np.linspace(10, 40, 1000)

distributions = []

for key, values in data.items():
    mean, std = calculate_mean_std(values["median"], values["10th"], values["90th"])
    distribution = np.random.normal(mean, std, 100000)

    distributions.append(distribution)

mean_of_means = {}
swab_types = ["NP", "Nasal", "Throat"]

# Not doing a weighted average because there are approximately the
# same number of samples for both qPCR targets.
for swab in swab_types:
    swab_means = [
        np.mean(distribution)
        for key, distribution in zip(data.keys(), distributions)
        if swab in key
    ]
    mean_of_means[swab] = np.mean(swab_means)

Figure 5: Nasal swabs vs Oro-pharyngeal swabs. Data from Goodall et al. 2022. Diamond indicates mean.

Patient status and disease onset adjustment

The four MGS studies we have examined include both inpatients and outpatients, with the latter making up 68 out of 203 individuals (33%) (Figure 6). The remaining study population consists of inpatients with either an unknown level of disease severity, or known ventilator or ICU usage.

Code

def get_fig_5():
    study_dfs = get_studies()

    df_np_babiker = study_dfs["Babiker et al. 2020"]
    df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
    df_op_lu = study_dfs["Lu et al. 2021"]
    df_np_mostafa = study_dfs["Mostafa et al. 2020"]
    order = ["Outpatient", "Inpatient", "Required\nventilator", "ICU", "Unknown"]

    lu_counts = df_op_lu["patient_status"].value_counts().reindex(order)
    babiker_counts = df_np_babiker["patient_status"].value_counts().reindex(order)
    mostafa_counts = df_np_mostafa["patient_status"].value_counts().reindex(order)
    rodriguez_counts = (
        df_np_rodriguez["patient_status"].value_counts().reindex(order).dropna()
    )

    if DEBUG:
        print(lu_counts, babiker_counts, mostafa_counts, rodriguez_counts)

    fig, axs = plt.subplots(
        1, 4, figsize=(10, 4), width_ratios=[0.7, 1.5, 2.4, 4], sharey=True, dpi=800
    )

    colors = sns.color_palette("viridis", 10)
    color_dict = {
        "Outpatient": colors[9],
        "Inpatient": colors[3],
        "Required\nventilator": colors[5],
        "ICU": colors[0],
        "Unknown": colors[7],
    }

    axs[0].bar(
        lu_counts.index, lu_counts.values, color=color_dict["Inpatient"], width=0.5
    )
    axs[0].set_title("Lu et al. 2021", fontsize=10)
    axs[0].set_ylabel("Count")

    axs[1].bar(
        babiker_counts.index,
        babiker_counts.values,
        color=[color_dict[x] for x in babiker_counts.index],
        width=0.8,
    )
    axs[1].set_title("Babiker et al. 2020", fontsize=10)

    axs[3].bar(
        mostafa_counts.index,
        mostafa_counts.values,
        color=[color_dict[x] for x in mostafa_counts.index],
    )
    axs[3].set_title("Mostafa et al. 2020", fontsize=10)
    axs[3].set_ylim(0, max(mostafa_counts.values) + 10)
    axs[3].set_yticks(np.arange(0, max(mostafa_counts.values) + 30, 10))

    axs[2].bar(
        rodriguez_counts.index,
        rodriguez_counts.values,
        color=[color_dict[x] for x in rodriguez_counts.index],
    )
    axs[2].set_title("Rodriguez et al. 2021", fontsize=10)
    axs[2].set_ylim(0, max(rodriguez_counts.values) + 5)
    axs[2].set_yticks(np.arange(0, max(rodriguez_counts.values) + 10, 5))

    for ax in axs:
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
        ax.tick_params(
            axis="x",
            which="both",
            top=False,
            bottom=False,
            labeltop=False,
            labelbottom=True,
            labelsize=9,
        )
        for x in np.arange(0, 50, 10):
            ax.axhline(
                y=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-1
            )

    plt.tight_layout()
    plt.show()


get_fig_5()

## Computing share of outpatients across studies

study_dfs = get_studies()
df_np_babiker = study_dfs["Babiker et al. 2020"]
df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
df_op_lu = study_dfs["Lu et al. 2021"]
df_np_mostafa = study_dfs["Mostafa et al. 2020"]

order = ["Outpatient", "Inpatient", "Required\nventilator", "ICU", "Unknown"]

lu_counts = df_op_lu["patient_status"].value_counts().reindex(order)
babiker_counts = df_np_babiker["patient_status"].value_counts().reindex(order)
mostafa_counts = df_np_mostafa["patient_status"].value_counts().reindex(order)
rodriguez_counts = (
    df_np_rodriguez["patient_status"].value_counts().reindex(order).dropna()
)

composite_counts = pd.DataFrame(
    {
        "Lu": lu_counts,
        "Babiker": babiker_counts,
        "Mostafa": mostafa_counts,
        "Rodriguez": rodriguez_counts,
    }
)

total_counts_across_studies = composite_counts.sum(axis=1)
share_outpatients_across_studies = (
    total_counts_across_studies["Outpatient"] / total_counts_across_studies.sum()
)
outpatient_share = f"{share_outpatients_across_studies:.0%}"
if DEBUG:
    print(f"outpatient_share: {outpatient_share}")

Figure 6: Patient composition of Lu et al. 2021, Babiker et al. 2020, Mostafa et al. 2020, and Rodriguez et al. 2021

Greater COVID-19 disease severity is associated with increased viral load; all else equal, we thus expect hospitalized patients to show higher viral abundance than outpatients. However, viral loads are also inversely correlated with time since symptom onset. As it takes time for someone to become hospitalized, inpatients are generally tested later in the course of their disease than outpatients, resulting in lower viral loads. It’s not immediately obvious which of these two effects we expect to dominate.

Perhaps unsurprisingly, given these opposing effects, different studies give inconsistent results when comparing SARS-CoV-2 viral loads between different groups of patients. Compared to inpatients, outpatients showed higher SARS-CoV-2 CT values in Babiker et al. (mean outpatient-inpatient difference of +1.17) but lower CT values in Rodriguez et al. (mean difference of -4.45). We identified two further studies that allowed us to make this comparison: Souverein et al. 2022 compared ~20,000 outpatients to ~300 inpatients and found an average CT difference of +1.3, while Knudtzen et al. 2021 compared 87 outpatients to 82 inpatients and found an average difference of -2.3. An unweighted average of the CT deltas in these four studies finds that outpatients have a mean CT value 1.07 lower than inpatients. However, it’s unclear that this number is very meaningful, given the large differences between studies and confounding effect of disease onset.

One of the four studies used above, Knudtzen et al. 2021, contains information on both patient status and symptom onset, allowing us to disentangle the two effects for this dataset (Figure 7). Fitting a simple linear model on this data, which takes into account patient status, the CT value increase per additional day is 0.37 (+- 0.13), while inpatient/outpatient status has no significant effect.

Code

def get_fig_6():
    df_knudtzen_ct_days = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/knudtzen2021_ct_days.tsv",
        sep="\t",
        skiprows=1,
    )
    df_knudtzen_ct_days["Patient Status"] = df_knudtzen_ct_days[
        "Patient Status"
    ].astype("category")
    df_knudtzen_ct_days["Day"] = df_knudtzen_ct_days["Day"].round()
    status_to_color = {
        status: idx
        for idx, status in enumerate(df_knudtzen_ct_days["Patient Status"].unique())
    }
    colors = sns.color_palette("viridis", len(status_to_color))

    reg = smf.ols('CT ~ Day + Q("Patient Status")', data=df_knudtzen_ct_days)
    results = reg.fit()
    if DEBUG:
        print(results.summary())

    fig, ax = plt.subplots(figsize=(10, 4))
    for status, color in zip(df_knudtzen_ct_days["Patient Status"].unique(), colors):
        subset = df_knudtzen_ct_days[df_knudtzen_ct_days["Patient Status"] == status]
        plt.scatter(
            subset["Day"],
            subset["CT"],
            c=[color],
            label=status,
            edgecolors="w",
            linewidths=0.5,
        )

        line_x_values = range(int(subset["Day"].min()), int(subset["Day"].max()) + 1)
        line_y_values = results.predict(
            pd.DataFrame(
                {"Day": line_x_values, "Patient Status": [status] * len(line_x_values)}
            )
        )

        plt.plot(line_x_values, line_y_values, color=color, linestyle="--", linewidth=2)

    plt.xlabel("Days from symptom onset")
    plt.ylabel("qPCR-CT Value")
    plt.legend(
        title="", loc="upper center", bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False
    )

    for val in range(0, 20, 5):
        ax.axvline(
            x=val, color="grey", linestyle="--", linewidth=0.5, alpha=0.3, zorder=-1
        )

    for val in range(15, 40, 5):
        ax.axhline(
            y=val, color="grey", linestyle="--", linewidth=0.5, alpha=0.3, zorder=-1
        )

    ax.set_xticks(range(0, int(df_knudtzen_ct_days["Day"].max()) + 1, 5))

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    plt.tight_layout()
    plt.show()


get_fig_6()

Figure 7: Relationship between days since symptom onset and viral load. Data extracted from Knudtzen et al. 2021, Figure 3A.

A second relevant study is He et al. 2020, who collected 414 OP swabs sampled from 94 inpatients in Guangzhou, China, none of whom were critically ill. Samples were collected longitudinally for some of these patients, with time from symptom onset ranging from 0 to 32 days. Average CT values upon symptom onset were around 31 CT values, rising to 35 at 3-4 days and reaching the limit of detection (CT=40) after 21 days. Treating the change in CT units as linear, this gives an average increase of 0.43 CT units per day; similar to the result from Knudtzen et al. Integrating the results from both of these studies, we estimate that each passing day between symptom onset and swab sampling leads to an increase of roughly 0.4 CT units.

Asymptomatic individual adjustment

Most individuals sampled in the four MGS studies were symptomatic patients, but we expect most infected people who would be captured by a community swabbing program to be asymptomatic, presymptomatic, or only mildly ill. About one third of COVID-19 cases are persistently asymptomatic (Sah et al. 2021); as such, we need to account for how they differ from symptomatic cases.

To investigate this, we analyzed four studies that compared qPCR results of asymptomatic COVID cases to those that were either symptomatic at the time of testing or developed symptoms later:

Hall et al. 2022 analyzed the results of Boston University’s testing program, which tested all individuals who came to campus, and logged symptoms upon taking the test. Comparing the median of asymptomatic individuals (n=319) with that of symptomatic (n=512) shows asymptomatics to have a CT that is 8.1 points higher (29.9 - 21.8).
Long et al. 2020, taking place in Wanzhou District, China, compared the first positive SARS-CoV-2 qPCR result of 37 quarantined, persistently asymptomatic individuals to 37 symptomatic patients with similar characteristics (sex, age, and comorbidities) and found no clear difference in median CT values between the two groups.
Lee et al. 2022 studied patients in a South Korean isolation facility. They do not provide raw data or median values, but report there was no qPCR CT value difference between symptomatic (n=214) and asymptomatic patients (n=89).
Yang et al. 2023 performed a prospective cohort study of patients admitted to a Shenzhen hospital with lab-confirmed SARS-CoV-2 infection, collecting both symptom status and repeat qPCR measurements. They provide peak CT values for asymptomatic (n=290), mildly ill patients (n=1319), and more severely ill patients (n=434). Asymptomatic patients have a peak median CT value 0.99 points higher than that of mildly ill patients (the study contains vaccinated individuals which is an important confounder for symptom status and viral load).

It’s unclear what accounts for the large inconsistency between studies. For the purposes of this analysis, we calculated the unweighted average CT value across the four studies (2.27 CT units higher for asymptomatics compared to those who were or later became symptomatic) but this is at best a crude measure.

Appendix 2: Supplemental Figures and Tables

Code

def get_fig_s1():
    ra_lists = list(get_study_ras().values())
    study_labels = list(get_study_ras().keys())
    composite_ras = get_composite_ras()
    study_labels.append("Composite Data")
    ra_lists.append(composite_ras)
    raw_distributions = []
    for ras in ra_lists:
        samples = get_logit_normal_samples(ras)
        raw_distributions.append(samples)

    fig, ax = plt.subplots(1, 1, figsize=(10, 4.2))
    log_distributions = [np.log10(distribution) for distribution in raw_distributions]
    log_rel_abuns = [np.log10(ras) for ras in ra_lists]

    viridis_palette = sns.color_palette("viridis", n_colors=len(study_labels) - 1)
    composite_study_color = "#aedc31"
    viridis_palette.append(composite_study_color)

    sns.violinplot(
        data=log_distributions,
        ax=ax,
        inner=None,
        palette=viridis_palette,
        orient="h",
        zorder=1,
        linewidth=0.5,
        cut=0,
        alpha=1,
        fill=False,
        bw_adjust=1.5,
    )

    sns.stripplot(
        data=log_rel_abuns,
        ax=ax,
        jitter=True,
        palette=viridis_palette,
        orient="h",
        alpha=0.7,
        zorder=2,
    )
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels(study_labels)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.set_ylabel("")

    y_labels = [
        (
            label.get_text().rsplit(" ", 1)[0]
            + "\n"
            + label.get_text().rsplit(" ", 1)[1]
            if " " in label.get_text()
            else label.get_text()
        )
        for label in ax.get_yticklabels()
    ]

    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    # x_min, x_max = ax.get_xlim()
    new_x_max = 0
    new_x_min = -10
    ax.set_xlim(new_x_min, new_x_max)
    current_xticks = ax.get_xticks()

    ax.set_xticks(current_xticks)
    ax.set_xticklabels(
        [
            "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
            for value in current_xticks
        ]
    )

    for x in np.arange(math.ceil(new_x_min), 1, 1):
        ax.axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
    ax.set_xlabel("Relative Abundance")

    plt.tight_layout()
    plt.show()


get_fig_s1()

Figure 8: Raw relative abundance values and logit-normal distributions for each study and the composite dataset.

Code

# Calculate zero count fractions for each sample size
def get_fig_zero_count():
    composite_small_sample_sizes = get_composite_p2ra(
        swab_sample_sizes=SMALL_SWAB_SAMPLE_SIZES
    )

    composite_normal_sample_size = get_composite_p2ra()
    df_swab = pd.concat([composite_small_sample_sizes, composite_normal_sample_size])

    zero_count_fractions = []
    sample_sizes = []
    for swab_sample_size in sorted(df_swab["Sample Size"].unique()):
        subset_df = df_swab[df_swab["Sample Size"] == swab_sample_size]
        zero_count_fraction = round((subset_df["Relative Abundance"] == 0).mean(), 2)
        zero_count_fractions.append(zero_count_fraction)
        sample_sizes.append(str(swab_sample_size))

    fig, ax = plt.subplots(figsize=(10, 3), dpi=800)
    bars = ax.bar(
        sample_sizes, zero_count_fractions, color=sns.color_palette("viridis", 1)
    )

    ax.set_xlabel("Pool Size (Number of Swabs)")
    ax.set_ylabel("Zero Count %")
    ax.set_ylim(0, 1)

    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{height*100:.0f}%",
            ha="center",
            va="bottom",
            fontsize=9,
        )
    # Convert y-axis ticks to percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: "{:.0%}".format(y)))
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    for y in np.arange(0, 1.1, 0.1):
        ax.axhline(
            y=y, color="grey", linestyle="--", alpha=0.2, linewidth=0.5, zorder=-1
        )

    plt.tight_layout()
    plt.show()


get_fig_zero_count()

Figure 9: Number of zero relative abundance results when using different pool sizes.

Sample Type	Sample Size	Median	IQR	Mean	Zero Count Fraction	Fold difference vs Average (median)
Lu et al. 2021	100	2.4e-05	0.00049	0.0009	0.28	235
Lu et al. 2021	200	0.00015	0.001	0.0009	0.09	1470
Lu et al. 2021	400	0.0004	0.0013	0.00086	0.01	3900
Babiker et al. 2020	100	1.5e-06	2.3e-05	0.00011	0.28	15
Babiker et al. 2020	200	7.8e-06	4.9e-05	0.00012	0.08	75
Babiker et al. 2020	400	2.1e-05	8.1e-05	0.00012	0.01	205
Mostafa et al. 2020	100	2.4e-07	3.5e-06	6.8e-06	0.44	0
Mostafa et al. 2020	200	1.3e-06	5e-06	6.4e-06	0.2	15
Mostafa et al. 2020	400	2.5e-06	5.4e-06	6.2e-06	0.03	25
Rodriguez et al. 2021	100	3.9e-06	0.00012	0.00051	0.29	40
Rodriguez et al. 2021	200	4.1e-05	0.0004	0.00057	0.09	405
Rodriguez et al. 2021	400	0.00015	0.00075	0.00057	0.01	1470
Original Composite Data	100	2.7e-06	6.9e-05	0.00038	0.31	25
Original Composite Data	200	2e-05	0.00019	0.00034	0.09	200
Original Composite Data	400	6.8e-05	0.00033	0.00034	0.01	670

Table 3: Comparison of study-level and unadjusted composite swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence.

Sample Type	Sample Size	Median	IQR	Mean	Zero Count Fraction	Fold difference vs Average (median)
Original Composite Data	50	0	1.1e-05	0.00031	0.56	0
Original Composite Data	75	3.4e-07	3.2e-05	0.00033	0.42	5
Adjusted Composite Data	50	0	1e-05	0.00027	0.56	0
Adjusted Composite Data	75	3.2e-07	2.6e-05	0.00028	0.42	5

Table 4: Comparison of swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence, assuming swab sample sizes of 50 and 75.

Footnotes

Rodriguez et al. (2021) generously shared qPCR and MGS data. We also thank Lu et al. (2021) and Mostafa et al. (2020) for answering questions about methodological details.↩︎
At pool sizes of 75 swabs, and 50 swabs respectively, 40%, and 54% of pools contain no SARS-CoV-2 at 1% weekly incidence (Table 4, Figure 9).↩︎
We picked weekly incidence as the disease indicator as this allows a comparison to our wastewater work, where we used the same metric.↩︎
3 days approximates the doubling time of wild-type SARS-CoV-2 at the beginning of the pandemic (Pellis et al. 2021). Assuming simple exponential growth in the number of cases, the current prevalence given a weekly incidence I is equal to P = I \times \frac{W}{W-1}, where W is the disease’s weekly growth rate. Assuming the 3 day doubling time, the weekly growth rate in this model is 5.040; at 1% weekly incidence, this gives a current prevalence of 1.248%.↩︎
For all swab sampling studies, we analyze samples that test positive in qPCR measurements, including qPCR-positive samples that have a SARS-CoV-2 relative abundance of 0.0.↩︎
They also ran MGS on samples of thirty individuals who tested negative to identify the underlying pathogen.↩︎
In qPCR, a fluorescently tagged genetic target sequence undergoes repeated amplification cycles (one cycle is ~one doubling). The fluorescent signal is measured after each amplification cycle. The cycle threshold is defined as the number of cycles required for the signal to become measurable. A low CT value thus indicates the target sequence was present in higher quantities in the initial sample, as it took fewer amplification cycles for the sequence to become detectable. Conversely, a higher CT value suggests a lower initial concentration of the target sequence. See here for additional information.↩︎
Just as a log-normal distribution models data whose logarithm is normally distributed, a logit-normal distribution models data with normally distributed logit. That is, for some normal random variable Z, logistic(Z) has a logit-normal distribution. Since for small x, logistic(x) ≈ exp(x), the logit-normal distribution functions similarly to the log-normal distribution in this case, with the added advantage that logistic-scale values are constrained to the unit interval — as is appropriate for relative abundances (more info).↩︎
Results for sampling with pools of 50 and 75 swabs can be found in Table 4.↩︎
35.0% of cases are asymptomatic, and 65.0% are symptomatic. Given a prevalence of 1.25%, 0.44% of individuals are infected and asymptomatic and 0.81% are symptomatic.↩︎
This is because a lower weekly growth rate results in higher prevalence when weekly incidence reaches 1% (see footnote 3).↩︎
In comparing the performance of different sample types (e.g., NP swab vs nasal swab), any positive sample was treated as ground truth (there is thus no possibility of having false positives).↩︎