Nucleic Acid Observatory

Investigating the sensitivity of pooled swab sampling for pathogen early detection

When implemented at scale, sequencing swabs in non-hospital settings could offer reliable pathogen-agnostic early detection
Authors Simon Grimm & Will Bradshaw
Date July 1, 2024
Code
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import math
import statsmodels.formula.api as smf
from IPython.display import Markdown
from tabulate import tabulate
from typing import List
from scipy.stats import gmean, linregress, norm
from collections import namedtuple
from collections import defaultdict

DEFAULT_SWAB_SAMPLE_SIZES = [100, 200, 400]
SMALL_SWAB_SAMPLE_SIZES = [50, 75]
ASYMPTOMATIC_SHARE = 0.35
TARGET_INCIDENCE_W = 0.01
DOUBLING_PERIOD_D = 3

DEBUG = False

WW_STUDY_TITLES = ["Rothman et al. 2021", "Crits-Christoph et al. 2021"]


def get_num_and_exp(x):
    scientific_x = f"{x:.1e}"
    number, exponent = scientific_x.split("e")[0], int(scientific_x.split("e")[1])
    return number, exponent


def logit(x):
    return np.log(x / (1 - x))


def logistic(x):
    return 1 / (1 + np.exp(-x))


def get_studies():
    df_op_lu = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/lu_op_ct_mgs.tsv", sep="\t", skiprows=1
    )  # Data obtained from Table S1.
    df_op_lu.rename(
        columns={"SCV-2 Relative Abundance": "scv2_ra", "Ct value": "scv2_ct"},
        inplace=True,
    )
    df_op_lu[["patient_status", "swab_type", "Study"]] = [
        "Inpatient",
        "op",
        "Lu et al. 2021",
    ]

    df_np_rodriguez = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/rodriguez_np_ct_mgs.csv", sep=";"
    )  # Data sent to us by authors.
    rodriguez_patient_status_dict = {
        "Hospit": "Inpatient",
        "Out_Patient": "Outpatient",
        "Intensive_Care": "ICU",
    }
    df_np_rodriguez["patient_status"] = df_np_rodriguez["Group"].replace(
        rodriguez_patient_status_dict
    )
    df_np_rodriguez["scv2_ra"] = (
        df_np_rodriguez["Reads_2019_CoV"] / df_np_rodriguez["Reads_Total"]
    )

    df_np_rodriguez.rename(columns={"CoV_Ct_number": "scv2_ct"}, inplace=True)
    df_np_rodriguez[["swab_type", "Study"]] = ["np", "Rodriguez et al. 2021"]

    df_np_babiker = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/babiker_np_ct_mgs.tsv",
        sep="\t",
        skiprows=1,
    )  # Data obtained from table S2
    df_np_babiker.rename(
        columns={
            "SARS-CoV-2 RT-PCR Ct": "scv2_ct",
            "SARS-CoV-2 RA": "scv2_ra",
            "Inpatient/ED vs. Outpatient": "patient_status",
        },
        inplace=True,
    )
    df_np_babiker["scv2_ct"] = (
        df_np_babiker["scv2_ct"].replace(",", ".", regex=True).astype(float)
    )
    df_np_babiker["patient_status"] = df_np_babiker["patient_status"].apply(
        lambda x: x if x in ["Inpatient", "Outpatient"] else "Unknown"
    )
    # The data uses . to represent missing data. Set this column to integers, while at the same time mapping missing data to NA.
    df_np_babiker["days_from_onset"] = (
        df_np_babiker["Day of Testing Relative to Symptom Onset"]
        .replace(".", "-1")
        .astype(int)
        .replace(-1, "NA")
    )
    df_np_babiker[["swab_type", "Study"]] = ["np", "Babiker et al. 2020"]

    df_np_mostafa = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/mostafa-np-ra-ct.tsv", sep="\t"
    )  # Data obtained from Table S2.
    mostafa_severity_dict = {
        1: "Required\nventilator",
        2: "ICU",
        3: "Inpatient",
        4: "Outpatient",
        0: "Unknown",
    }
    df_np_mostafa.rename(
        columns={
            "SARS-CoV-2 RT-PCR Ct value": "scv2_ct",
            "CosmosID Proportion Mapped to SARS-CoV-2": "scv2_ra",
        },
        inplace=True,
    )
    df_np_mostafa["Severity index"] = df_np_mostafa["Severity index"].replace("–", 0)
    df_np_mostafa["patient_status"] = (
        df_np_mostafa["Severity index"].astype(int).replace(mostafa_severity_dict)
    )
    # There is no information of why some patients only have "<7" as their days from onset. We set it to 3.5 (the average of 1-6 days.)
    df_np_mostafa["days_from_onset"] = df_np_mostafa["No. of days from onset"].replace(
        {"–": "NA", "<7": "3.5"}
    )
    # Drop samples unless we have both qPCR and MGS detection
    df_np_mostafa = df_np_mostafa[df_np_mostafa["COVID-19-positive"] == True]
    df_np_mostafa = df_np_mostafa[df_np_mostafa["scv2_ct"] != "–"]
    df_np_mostafa["scv2_ct"] = df_np_mostafa["scv2_ct"].astype(float)
    df_np_mostafa[["swab_type", "Study"]] = ["np", "Mostafa et al. 2020"]

    study_dfs = {
        "Lu et al. 2021": df_op_lu,
        "Babiker et al. 2020": df_np_babiker,
        "Mostafa et al. 2020": df_np_mostafa,
        "Rodriguez et al. 2021": df_np_rodriguez,
    }
    return study_dfs


def get_study_ras():
    studies = get_studies()
    study_ras = {}
    for title, df in studies.items():
        study_ras[title] = df["scv2_ra"].tolist()
    return study_ras


def get_composite_ras():
    study_ras = get_study_ras().values()
    composite_swab_ras = sum(study_ras, [])
    return composite_swab_ras


def get_adjusted_composite_ras():
    study_dfs = get_studies().values()
    composite_df = pd.concat(study_dfs)
    composite_df = composite_df[
        composite_df["patient_status"].isin(["Inpatient", "Outpatient"])
    ]
    zero_ras = composite_df[composite_df["scv2_ra"] == 0]["scv2_ra"].tolist()
    symptom_status_dfs = adjust_cts(composite_df)
    symptom_status_ras = defaultdict(list)
    for symptom_status in ["Asymptomatic", "Symptomatic"]:
        df = symptom_status_dfs[symptom_status]
        df = df[df["scv2_ra"] != 0]
        df = adjust_rel_abun(df)
        ras = df["adjusted_scv2_ra"].tolist() + zero_ras
        symptom_status_ras[symptom_status] = ras

    return symptom_status_ras


def get_study_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    study_ras = get_study_ras().values()
    study_titles = get_study_ras().keys()

    study_p2ra_df = get_swab_p2ra(study_titles, study_ras, swab_sample_sizes)
    return study_p2ra_df


def get_composite_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    composite_ras = get_composite_ras()
    composite_p2ra_df = get_swab_p2ra(
        ["Original Composite Data"], [composite_ras], swab_sample_sizes
    )
    return composite_p2ra_df


def get_adjusted_composite_p2ra(swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    symptom_status_ras = get_adjusted_composite_ras()

    asymptomatic_p2ra_df = get_swab_p2ra(
        ["Adjusted Composite Data"],
        [symptom_status_ras["Asymptomatic"]],
        swab_sample_sizes,
    )
    symptomatic_p2ra_df = get_swab_p2ra(
        ["Adjusted Composite Data"],
        [symptom_status_ras["Symptomatic"]],
        swab_sample_sizes,
    )

    asymptomatics = asymptomatic_p2ra_df.sample(
        frac=ASYMPTOMATIC_SHARE, random_state=42
    )
    symptomatics = symptomatic_p2ra_df.sample(
        frac=1 - ASYMPTOMATIC_SHARE, random_state=42
    )

    composite_adjusted_p2ra_df = pd.concat(
        [asymptomatics, symptomatics], ignore_index=True
    )
    return composite_adjusted_p2ra_df


def get_asymptomatic_factor():
    # https://doi.org/10.1371/journal.pone.0270694
    long_2020_asymptomatic_delta = 0  # "The initial CT values for 37 asymptomatic individuals and 37 symptomatic patients appeared similar" https://www.nature.com/articles/s41591-020-0965-6
    lee_2020_asymptomatic_delta = 0  # "There were no significant differences in CT values between asymptomatic and symptomatic (including presymptomatic) patients." 10.1001/jamainternmed.2020.3862
    yang_2023_asymptomatic_delta = 0.99  # Extracted from supplement figure 4D https://doi.org/10.1016/S2666-5247(23)00139-8

    hall_asymptomatic_ct_median = 29.9
    hall_symptomatic_ct_median = 21.8
    hall_asymptomatic_delta = hall_asymptomatic_ct_median - hall_symptomatic_ct_median
    ASYMPTOMATIC_ADJUSTMENT_FACTOR = (
        hall_asymptomatic_delta
        + long_2020_asymptomatic_delta
        + lee_2020_asymptomatic_delta
        + yang_2023_asymptomatic_delta
    ) / 4

    return ASYMPTOMATIC_ADJUSTMENT_FACTOR


def adjust_cts(df):
    ASYMPTOMATIC_ADJUSTMENT_FACTOR = get_asymptomatic_factor()

    np_data = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-06-18-np-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )
    np_means = np_data.mean()

    NP_ADJUSTMENT_FACTOR = np_means.mean()
    goodall_data = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/goodall-op-nasal-ct.tsv",
        sep="\t",
        skiprows=2,
        header=None,
    )
    OP_ADJUSTMENT_FACTOR = goodall_data[0].mean()
    if DEBUG:
        print(f"Goodall mean (OP): {op_goodall_mean}")
        print(f"NP mean: {NP_ADJUSTMENT_FACTOR}")

    df["adjusted_scv2_ct"] = df["scv2_ct"]
    # Subtract the adjustment factors from the CT values (NP_ADJUSTMENT_FACTOR is negative, so it increases the CT values)
    df.loc[df["swab_type"] == "np", "adjusted_scv2_ct"] -= NP_ADJUSTMENT_FACTOR
    df.loc[df["swab_type"] == "op", "adjusted_scv2_ct"] -= OP_ADJUSTMENT_FACTOR
    df_symptomatic = df.copy()
    df_asymptomatic = df.copy()
    df_asymptomatic["adjusted_scv2_ct"] += ASYMPTOMATIC_ADJUSTMENT_FACTOR

    symptom_status_dfs = {
        "Asymptomatic": df_asymptomatic,
        "Symptomatic": df_symptomatic,
    }

    return symptom_status_dfs


def adjust_rel_abun(composite_df):
    composite_df = composite_df.copy()
    composite_df.loc[:, "scv2_ra_logit"] = composite_df["scv2_ra"].apply(logit)

    slope, intercept, r_value, p_value, std_err = linregress(
        composite_df["scv2_ct"], composite_df["scv2_ra_logit"]
    )
    composite_df["adjusted_scv2_ra_logit"] = (
        intercept + slope * composite_df["adjusted_scv2_ct"]
    )
    residuals = composite_df["scv2_ra_logit"] - (
        intercept + slope * composite_df["scv2_ct"]
    )

    sigma_squared = np.var(residuals, ddof=2)

    sigma = np.sqrt(sigma_squared)

    noise = np.random.normal(loc=0, scale=sigma, size=len(composite_df))

    composite_df["adjusted_scv2_ra_logit_with_noise"] = (
        composite_df["adjusted_scv2_ra_logit"] + noise
    )
    composite_df["adjusted_scv2_ra"] = composite_df[
        "adjusted_scv2_ra_logit_with_noise"
    ].apply(logistic)
    return composite_df


def get_swab_p2ra(subset_titles, ra_lists, swab_sample_sizes=DEFAULT_SWAB_SAMPLE_SIZES):
    df_swab = pd.DataFrame()
    for subset, ras in zip(subset_titles, ra_lists):
        samples = get_logit_normal_samples(ras)
        df = simulate_p2ra_many(samples, swab_sample_sizes, n_simulations=10000)
        df["Subset"] = subset
        df_swab = pd.concat(
            [
                df_swab,
                df.melt(
                    id_vars="Subset",
                    value_vars=swab_sample_sizes,
                    var_name="Sample Size",
                    value_name="Relative Abundance",
                ),
            ]
        )

    return df_swab


def get_logit_normal_samples(ras):
    ra_values = np.array(ras)
    zero_share = (ra_values == 0).mean()
    ra_values = ra_values[ra_values != 0]
    logit_ra_values = logit(ra_values)
    mean, std = np.mean(logit_ra_values), np.std(logit_ra_values)
    norm_dist = norm(loc=mean, scale=std)
    logit_samples = norm_dist.rvs(size=int(100000 * (1 - zero_share)))
    samples = logistic(logit_samples)
    samples = np.append(samples, np.zeros(int(100000 * zero_share)))
    np.random.shuffle(samples)
    return samples


def get_prevalence():
    days_in_week = 7
    growth_factor = 2 ** (1 / DOUBLING_PERIOD_D)
    weekly_growth = growth_factor**days_in_week
    weekly_incidence = 0.01
    prevalence = weekly_incidence * (weekly_growth) / (weekly_growth - 1)

    if DEBUG:
        # Validate that calculated prevalence gives correct weekly incidence
        # Prevalence on each past day in the previous week
        past_prevalence = [
            prevalence / growth_factor**x for x in range(days_in_week + 1)
        ]  # +1 to include prevalence on day 0
        # Incidence on each past day in the previous week
        past_incidence = -np.diff(past_prevalence)

        assert np.isclose(sum(past_incidence), weekly_incidence)
    return prevalence


def simulate_p2ra_many(ra_lists, sample_populations, n_simulations) -> pd.DataFrame:
    prevalence = get_prevalence()
    results = defaultdict(list)
    for sample_pop in sample_populations:
        for _ in range(n_simulations):
            results[sample_pop].append(simulate_p2ra(sample_pop, ra_lists, prevalence))
    for key, values in results.items():
        results[key] = sorted(values)
    df = pd.DataFrame(results)
    return df


def simulate_p2ra(sample_pop, ra_lists, prevalence):
    n_sick = np.random.poisson(sample_pop * prevalence)
    if n_sick == 0:
        return 0
    cumulative_ra_sick = 0
    for _ in range(n_sick):
        observation = np.random.choice(ra_lists)
        cumulative_ra_sick += observation
    individual_ra_sick = cumulative_ra_sick / n_sick
    relative_abundance = n_sick / sample_pop * individual_ra_sick
    return relative_abundance


def get_fig_2_dfs():
    df_swab = pd.concat([study_p2ra_df, composite_p2ra_df], ignore_index=True)

    df_ww = pd.DataFrame(
        {
            "Relative Abundance": rothman_p2ra_ras + crits_christoph_p2ra_ras,
            "Subset": ["Rothman et al. 2021"] * len(rothman_p2ra_ras)
            + ["Crits-Christoph et al. 2021"] * len(crits_christoph_p2ra_ras),
        }
    )
    return df_swab, df_ww


def get_fig_7_dfs():
    df_swab = pd.concat([composite_p2ra_df, adjusted_composite_p2ra_df])
    df_swab.reset_index(drop=True, inplace=True)
    df_ww = pd.DataFrame(
        {
            "Relative Abundance": rothman_p2ra_ras + crits_christoph_p2ra_ras,
            "Subset": ["Rothman et al. 2021"] * len(rothman_p2ra_ras)
            + ["Crits-Christoph et al. 2021"] * len(crits_christoph_p2ra_ras),
        }
    )

    return df_swab, df_ww


def get_ww_p2ra():
    df_wastewater = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-05-07-fits-subset.tsv", sep="\t"
    )  # Original fits are taken from p2ra-manuscript. Based on the v1 pipeline (https://github.com/naobservatory/mgs-pipeline). Subset to be small enough for Github with ../scripts/subset_fits_tsv.py.
    rothman_p2ra_ras = df_wastewater[
        (df_wastewater["study"] == "rothman")
        & (df_wastewater["location"] == "Overall")
        & (df_wastewater["pathogen"] == "sars_cov_2")
    ]["ra_at_1in100"].tolist()
    crits_christoph_p2ra_ras = df_wastewater[
        (df_wastewater["study"] == "crits_christoph")
        & (df_wastewater["location"] == "Overall")
        & (df_wastewater["pathogen"] == "sars_cov_2")
    ]["ra_at_1in100"].tolist()

    ww_p2ra = {
        "Rothman et al. 2021": rothman_p2ra_ras,
        "Crits-Christoph et al. 2021": crits_christoph_p2ra_ras,
    }
    return ww_p2ra


def plot_medians(ax, medians):
    patches = ax.collections
    for patch, median in zip(patches, medians):
        for path in patch.get_paths():
            vertices = path.vertices
            x_mid = median
            closest_x = min(vertices[:, 0], key=lambda x: abs(x - x_mid))
            y_upper = max(vertices[vertices[:, 0] == closest_x, 1])
            y_lower = min(vertices[vertices[:, 0] == closest_x, 1])
            ax.plot([x_mid, x_mid], [y_lower, y_upper], color="white", linewidth=1.5)


# Generate p2ra values one time before figures are created.
study_dfs = get_studies().values()
composite_p2ra_df = get_composite_p2ra()
adjusted_composite_p2ra_df = get_adjusted_composite_p2ra()
study_p2ra_df = get_study_p2ra()
ww_p2ra_dfs = get_ww_p2ra()
rothman_p2ra_ras = ww_p2ra_dfs["Rothman et al. 2021"]
crits_christoph_p2ra_ras = ww_p2ra_dfs["Crits-Christoph et al. 2021"]

ASYMPTOMATIC_ADJUSTMENT_FACTOR = get_asymptomatic_factor()
rounded_asymptomatic_factor = f"{ASYMPTOMATIC_ADJUSTMENT_FACTOR:.2f}"

asymptomatic_share_percentage = f"{int(ASYMPTOMATIC_SHARE * 100)}%"
symptomatic_share_percentage = f"{int((1 - ASYMPTOMATIC_SHARE) * 100)}%"

Acknowledgements: Thanks to Mike McLaren and Jeff Kaufman for helpful feedback on this report. Jeff Kaufman performed an initial exploratory analysis that prompted the creation of this longer report. We also thank the authors1 of the qPCR and metagenomic studies who shared their data upon request and answered our questions.

Summary

  • Respiratory swab sampling is central to infectious disease surveillance, but most swab sampling programs focus on targeted identification of specific pathogens. At the NAO, we’re interested in how a such a program would perform for more agnostic detection of novel pathogens.
  • To evaluate this, we collected published paired qPCR/MGS data from four studies of SARS-CoV-2 patients. Using these datasets, we predict the relative abundance of a SARS-CoV-2-like pathogen in sequencing data from pooled respiratory swabs, collected in public places at 1% weekly incidence. Based on our modeling we find the following:
    • A composite of four studies gives a median relative abundance of 1.3 \times 10-6 in a sample of 100 swabs at 1% weekly incidence, and 1.3 \times 10-5 in a sample of 200 swabs. The data this model is based on was adjusted to reflect differences between study patient groups and the expected population of our proposed sampling program.
    • These results make swab sampling and sequencing 20- and 130-fold more sensitive than wastewater sampling, when collecting and sequencing 100 and 200 swabs, respectively.
    • When collecting 400 swabs, we see a 405-fold improvement over wastewater sequencing. However, at this sample size, the costs associated with increased sampling effort may outweigh the benefits of additional sensitivity.
    • Small pools frequently contain no infected individuals (400 swabs = 1% no detect, 200 swabs = 10%, 100 swabs = 32%2). The increased median sensitivity of higher pool sizes is thus mostly mediated through a reduction in zero-abundance results.
  • At adequate pool sizes, the sensitivity of pooled swab sequencing appears superior to municipal wastewater.
  • To further investigate the potential value of swab sampling and sequencing for pathogen early warning, the NAO is now running a pilot surveillance program in the Boston area. In this program, we are collecting, pooling, and sequencing self-administered respiratory samples to search for novel pathogens using genetic engineering detection.

Introduction

Respiratory pathogens must pass through the pharynx, mouth, and nose during transmission and as such, are often found in high concentrations in anterior nasal, oropharyngeal, and nasopharyngeal swabs. As a result, we hypothesized that respiratory viruses show higher relative abundance (the fraction of all sequencing reads assigned to that pathogen) in respiratory swab samples than in other samples we’ve looked at previously, including wastewater and air samples. High relative abundance means less sequencing depth required for detection, which decreases sequencing costs. Combined with the relatively low regulatory hurdles associated with swabbing individuals in non-clinical settings, this would make pooled swab sampling a promising approach for detecting novel respiratory pathogens.

Non-clinical swab sampling programs have precedent. One such program is CDC’s Traveler-Based Genomic Surveillance (TGS) program which collects swabs from international travelers entering the US. CDC TGS has traditionally focused on specific pathogens, detected via quantitative polymerase chain reaction (qPCR) and amplicon sequencing. In this report, we evaluate the performance of a more pathogen-agnostic detection method: metagenomic sequencing (MGS) of respiratory swabs from the general public. To do this, we use existing sequencing data from swabs to estimate the expected relative abundance of a SARS-CoV-2-like pathogen under specific epidemiological assumptions. Specifically, we model a scenario where,

  • A SARS-CoV-2 like-pathogen is at 1% weekly incidence, growing exponentially with a doubling time of 3 days (wild-type COVID-19)3.
  • A sampling program collects swabs from the population in public places, like train stations, public parks, or sidewalks.
  • Samples are pooled and sequenced using untargeted metagenomic sequencing (MGS).

In our modeling, we use data from four different studies that performed MGS of swabs from individuals infected with SARS-CoV-2. Subsequently, we make some simple adjustments to try to account for differences between the individuals in these datasets and those who would be likely to contribute samples in non-medical settings. The results suggest pooled swab sampling and sequencing could be a viable approach for MGS-based early detection of SARS-CoV-2-like novel pathogens, with one to two orders of magnitude better sensitivity than a wastewater-based program.

Estimating sampling sensitivity with SARS-CoV-2 sequencing data

In clinical settings, untargeted MGS is typically only used as a last resort for severe undiagnosed infections. Nevertheless, there are many studies showing that MGS can detect pathogens in swab samples with decent sensitivity, among them SARS-CoV-2, influenza, and other pathogens including RSV, rhinovirus, adenovirus, and metapneumovirus. For the purposes of this analysis, we focus on studies that performed untargeted MGS on COVID-19 patients. We only use studies that performed qPCR alongside MGS, which gives us a mapping between targeted and untargeted quantification metrics that we can use for later adjustments (see Appendix)4. We found four such studies for which sufficient data were available:

  • Babiker et al. 2020 analyzed nasopharyngeal (NP) samples from 10 outpatients and 35 inpatients who were SARS-CoV-2 qPCR-positive5. Samples were collected in Atlanta (Georgia, USA); the median (IQR) time between symptom onset and sample collection was 5 (4 to 7) days.
  • Mostafa et al. 2020, sampled NP swabs from 50 patients, 26 of which tested positive for COVID-19 in both qPCR and MGS. Sampling took place at Johns Hopkins Hospital in Baltimore (Maryland, USA). Out of these 26, 17 have information on days between symptom onset and sampling, with a mean of 5.3 days.
  • Lu et al. 2021 analyzed oropharyngeal (OP) swab samples from inpatients in Wuhan (China). The paper provides limited information on patient disease severity or other characteristics, stating only that “Respiratory specimens (swabs) [were] collected from patients admitted to various Wuhan health care facilities.” There is no information on the time between symptom onset and the sampling date.
  • Rodriguez et al. 2021 collected NP swabs from 42 outpatients, 17 inpatients not requiring intensive care and 45 intensive care unit (ICU) patients. All patients were sampled at a hospital in Paris (France). There is no information on the time between symptom onset and the sampling date.

SARS-CoV-2 relative abundance, as reported by the authors of each study, varied widely both within and between studies (Figure 1); in particular, Lu et al. 2021 and Rodriguez et al. 2021 showed substantially higher relative abundances than Babiker and Mostafa. The greatest range of relative abundances is seen in Rodriguez et al. 2021, with values spanning close to 7 orders of magnitude. These differences could arise from differences in swab type, sample processing, disease severity, or other factors; crucially, each study performed its own computational analysis to arrive at SARS-CoV-2 relative abundance, which might add additional inter-study variability.

Code
study_dfs = get_studies()

fig, axs = plt.subplots(2, 2, figsize=(10, 8), dpi=900)

df_np_babiker = study_dfs["Babiker et al. 2020"]
df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
df_op_lu = study_dfs["Lu et al. 2021"]
df_np_mostafa = study_dfs["Mostafa et al. 2020"]

if DEBUG:
    for study_df in df_np_babiker, df_np_rodriguez, df_op_lu, df_np_mostafa:
        print(
            study_df["Study"].unique(),
            f"Relative Abundance: {gmean(study_df['scv2_ra'])}",
            f"qPCR CT: {gmean(study_df['scv2_ct'])}",
        )

composite_df = pd.concat([df_op_lu, df_np_babiker, df_np_mostafa, df_np_rodriguez])
viridis_palette = sns.color_palette("viridis", n_colors=composite_df["Study"].nunique())

sns.histplot(
    ax=axs[0, 0],
    data=composite_df,
    x="scv2_ra",
    hue="Study",
    palette=viridis_palette,
    bins=30,
    log_scale=True,
    element="bars",
    linewidth=0,
    multiple="stack",
    legend=False,
)

axs[0, 0].set_title("a", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[0, 0].set_ylabel("Count")
axs[0, 0].set_xlabel("Relative Abundance")
axs[0, 0].tick_params(axis="x", which="minor", bottom=False)

sns_default = sns.color_palette(n_colors=composite_df["patient_status"].nunique())
sns.histplot(
    ax=axs[0, 1],
    data=composite_df,
    x="scv2_ra",
    hue="patient_status",
    palette=sns_default,
    bins=30,
    log_scale=True,
    element="bars",
    linewidth=0,
    multiple="stack",
    legend=False,
)
axs[0, 1].set_title("b", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[0, 1].set_ylabel("Count")
axs[0, 1].set_xlabel("Relative Abundance")
axs[0, 1].tick_params(axis="x", which="minor", bottom=False)

sns.scatterplot(
    ax=axs[1, 0],
    data=composite_df,
    x="scv2_ra",
    y="scv2_ct",
    hue="Study",
    style="Study",
    palette=viridis_palette,
)
axs[1, 0].set_title("c", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[1, 0].set_xlabel("Relative Abundance")
axs[1, 0].set_ylabel("qPCR cycle threshold value")
axs[1, 0].set_xscale("log")

axs[1, 0].tick_params(axis="x", which="minor", bottom=False)

sns.move_legend(
    axs[1, 0],
    "lower center",
    bbox_to_anchor=(0.5, -0.44),
    ncol=2,
    title=None,
    frameon=False,
    fontsize=11,
    markerscale=2,
)

sns.scatterplot(
    ax=axs[1, 1],
    data=composite_df,
    x="scv2_ra",
    y="scv2_ct",
    hue="patient_status",
    style="patient_status",
    palette=sns_default,
)
axs[1, 1].set_title("d", x=-0.08, y=1.0, ha="left", fontsize=10, fontweight="bold")
axs[1, 1].set_xlabel("Relative Abundance")
axs[1, 1].set_ylabel("qPCR cycle threshold value")
axs[1, 1].set_xscale("log")

axs[1, 1].tick_params(axis="x", which="minor", bottom=False)

sns.move_legend(
    axs[1, 1],
    "lower center",
    bbox_to_anchor=(0.5, -0.54),
    ncol=2,
    title=None,
    frameon=False,
    fontsize=11,
    markerscale=1.7,
)

for ax in axs.flat:
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
plt.tight_layout()
Figure 1: SARS-CoV-2 qPCR CT6 vs SARS-CoV-2 relative abundance (a, c) Data across studies (b,d) Data across patient types. Zero relative abundance values are not shown but account for 13 samples in Mostafa et al. 2020 (33.3%) and 1 sample in Rodriguez et al. 2021 (0.96%).

These data can be used to estimate the expected relative abundance observed at a particular disease incidence in the population, an analysis we previously performed for wastewater. To generate these estimates, we simulated swabbing individuals from the population at random, with pools of swabs sequenced collectively. The number of sick people in each sample is Poisson-distributed. SARS-CoV-2 relative abundance was calculated as the average across individuals in the pool, with the relative abundance of uninfected individuals assumed to be 0. The relative abundance of infected individuals also had some chance of being 0, based on the number of individuals in the data with a positive qPCR result but no SARS-CoV-2 detected in sequencing data; other infected individuals had an RA drawn from a logit-normal distribution fitted to the data. (see logit distributions in Figure 8).

Figure 2 displays the distribution of relative abundances from a simulation where SARS-CoV-2 incidence is 1% per week, with swabs pooled in batches of 100, 200, or 4007. For a pool size of 100 swabs, the median SARS-CoV-2 relative abundance ranges from 2.1 \times 10–7 (Mostafa et al. 2020) to 2.0 \times 10-5 (Lu et al. 2021). When combining data from all four studies, the median relative abundance is 2.5 \times 10-6 for 100 swabs. Detection sensitivity improves by nearly one order of magnitude when increasing the pool size to 200 swabs, with a relative abundance of 2.0 \times 10-5. More detailed results can be found in Table 3. Smaller pools frequently contain no SARS-CoV-2 (~32% for pools with 100 swabs, ~10% for 200 swabs, and ~1% for 400 swabs, Figure 9). The number of swabs significantly affects the results: medians for 100 and 200 swab pools are notably lower than the 400 swab median due to zero-abundance pools (see median indicators in Figure 2).

In comparison, the equivalent estimate for SARS-CoV-2 in wastewater is approximately 7.4 \times 10-8 in Rothman et al. (2021) and 1.4 \times 10-7 in Crits-Christoph et al. (2021). Compared to sequencing a pool of 100 swabs, relative abundance in Rothman and Crits-Christoph is thus 35 and 20 times lower. Compared to a 200-swab pool there is a 265- and 140-fold difference, respectively. Finally, averaging across the two wastewater studies, this gives an overall prediction of a 25- and 195-fold difference in relative abundance between municipal wastewater and community swabbing for SARS-CoV-2 for 100 and 200 swabs, respectively.

Code
def get_fig_2():
    df_swab, df_ww = get_fig_2_dfs()

    if DEBUG:
        for subset in df_swab["Subset"].unique():
            for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
                subset_df = df_swab.query(
                    "Subset == @subset and `Sample Size` == @swab_sample_size"
                )
                median = subset_df["Relative Abundance"].median()
                zero_count_fraction = (subset_df["Relative Abundance"] == 0).mean()
                print(
                    f"Subset: {subset}, Sample Size: {swab_sample_size}, Fraction of Zero Counts: {zero_count_fraction}, Median: {median}"
                )

    df_ww["Relative Abundance"] = np.log10(df_ww["Relative Abundance"])
    with np.errstate(divide="ignore"):
        df_swab["Relative Abundance"] = np.log10(df_swab["Relative Abundance"])
    fig, axs = plt.subplots(
        2, 1, figsize=(8, 6), dpi=900, gridspec_kw={"height_ratios": [3, 1]}
    )
    fig.subplots_adjust(hspace=0.4)
    colors = sns.color_palette("viridis", len(DEFAULT_SWAB_SAMPLE_SIZES))

    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[0],
        hue="Sample Size",
        palette=colors,
        data=df_swab,
        linewidth=0,
        bw_adjust=1.5,
        dodge=0.4,
        cut=0.5,
    )

    swab_medians_logged = []

    for subset in df_swab["Subset"].unique():
        for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
            subset_df = df_swab.query(
                "Subset == @subset and `Sample Size` == @swab_sample_size"
            )
            median_logged = subset_df["Relative Abundance"].median()
            swab_medians_logged.append(median_logged)

    plot_medians(axs[0], swab_medians_logged)

    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[1],
        color="#aedc31",
        data=df_ww,
        inner=None,
        linewidth=0,
        bw_adjust=1.5,
        width=0.5,
        dodge=0.2,
    )

    ww_medians_logged = []
    ww_medians = []
    for study in df_ww["Subset"].unique():
        median = np.median(df_ww[(df_ww["Subset"] == study)]["Relative Abundance"])

        ww_medians_logged.append(median)
        ww_medians.append(10**median)
    if DEBUG:
        print(ww_medians)
    plot_medians(axs[1], ww_medians_logged)

    axs[0].set_title(
        "a (Swabs)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="heavy"
    )
    axs[1].set_title(
        "b (Wastewater)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="bold"
    )

    for ax in axs:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.set_ylabel("")
        ax.set_yticks(ax.get_yticks())
        y_labels = [
            (
                label.get_text().rsplit(" ", 1)[0]
                + "\n"
                + label.get_text().rsplit(" ", 1)[1]
                if " " in label.get_text()
                else label.get_text()
            )
            for label in ax.get_yticklabels()
        ]
        ax.set_yticklabels(y_labels)

        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    xmin, xmax = -10, 0
    axs[0].set_xlim(xmin, xmax)
    axs[1].set_xlim(xmin, xmax)

    for x in np.arange(math.ceil(xmin), 1, 1):
        axs[0].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
        axs[1].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )

    current_xticks = axs[0].get_xticks()
    for ax in axs:
        ax.set_xticks(current_xticks)
        ax.set_xticklabels(
            [
                "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
                for value in current_xticks
            ]
        )
    legend = axs[0].legend(
        title="Number of\nswabs",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        ncol=1,
        frameon=False,
    )
    legend._legend_box.align = "left"

    plt.tight_layout()


get_fig_2()
Figure 2: Expected relative abundance at 1% SARS-CoV-2 incidence for (a) pooled respiratory swabs and (b) wastewater sequencing (data from Grimm et al. 2024). Violins indicate probability density only over simulations for which relative abundance was >0%; zero-abundance results aren’t plotted but can make up a substantial fraction of results (Figure 9). Medians (indicated by white bars) incorporate these zero-abundance results.

Adjusting for confounders

Prima facie, the results presented in Figure 2 seem like an argument in favor of pooled swab sampling over wastewater sampling. However, the patients being sampled in these four studies are not equivalent to the random members of the public we’d expect to sample. Three particularly relevant differences are:

  1. Swab type: the four MGS studies all used either NP swabs (Babiker et al. 2020, Mostafa et al. 2020, and Rodriguez et al. 2021), or OP swabs (Lu et al. 2021). Conversely, we expect a public-targeted sampling program would most likely use anterior nasal (henceforth simply “nasal”) or combined nasal/OP swabs. NP swabs, in particular, are poorly suited to mass sampling, as they are quite uncomfortable and require a trained professional to perform (and are thus unsuited to self-swabbing).
  2. Disease severity: Most patients in the four MGS studies were hospitalized, but a public-targeted sampling program would primarily sample from individuals who were asymptomatic, pre-symptomatic, or only mildly ill.
  3. Disease onset: Most of the inpatients in the four MGS studies were sampled later in the course of their infection, but viral loads for SARS-CoV-2 (and many other diseases) are highest around symptom onset and decline over time. In public-facing sampling, we’d expect to sample infected people closer to the start of their infection than in hospitals.

In Appendix 1 we investigate each of these differences in detail and try to estimate the likely effect on the measured relative abundance at a given disease incidence. In summary we find the following:

  • NP swab samples are expected to be more sensitive than nasal swabs. To approximate nasal swabs NP swab samples thus get a +1.43 CT adjustment (becoming less sensitive), based on data from five different studies (Figure 4). Conversely OP swabs are less sensitive than nasal swabs. Based on data from two studies (Figure 5), OP swab samples receive a -0.92 CT adjustment (becoming more sensitive).
  • Asymptomatics generally show lower viral loads. Based on data from four studies, symptomatic patients who are adjusted to resemble asymptomatics thus receive a +2.27 CT adjustment.
  • Samples from ICU patients and patients on ventilators are expected to differ from mildly-ill or asymptomatic cases in ways that are hard to just adjust for. We thus drop ICU and ventilator samples from our analysis.
  • Comparisons of inpatient and outpatient samples are frequently confounded by disease onset. We thus do not adjust for inpatient and outpatient status.
  • Finally, though we created estimates on how CT values change as the time between disease onset and sampling increases (+0.4 CT units per day), Lu et al. 2021 and Rodriguez et al. 2021 have limited or no data on the time between symptom onset and sampling date. Therefore, we do not adjust for this confounder.

Modeling the sensitivity of a pooled swab sampling program

Based on these findings, we re-estimate the expected SARS-CoV-2 relative abundance at 1% weekly incidence in the following way:

  1. We create a composite dataset by combining samples from all studies.
  2. We apply a linear regression fitted to the original qPCR and logit-scale relative-abundance data, generating a relationship between qPCR and sequencing results.
  3. We create a new composite dataset by combining samples from all studies, dropping samples from ICU patients, and individuals on ventilators, and those with unknown patient status.
  4. We adjust CT values to represent nasal swabs by applying a +1.43 CT adjustment to NP swab samples and a -0.92 CT adjustment to OP samples.
  5. We create separate asymptomatic and symptomatic versions of the dataset, applying a further +2.27 CT to the former while leaving the latter unchanged.
  6. Separately for the asymptomatic and symptomatic distributions, we convert the adjusted CT values into adjusted SARS-CoV-2 relative abundances using the linear regression from step 2. We then fit a logit-normal distribution on the adjusted, non-zero RAs as above.
  7. We conduct Monte Carlo simulations of pooled swab samples, in which the overall relative abundance of the pool is the average of each swab, and the relative abundance of each swab in the pool is calculated as one of three states:
    1. Non-infected: relative abundance (RA) = 0.
    2. Infected, asymptomatic: RA drawn from adjusted RA distribution for asymptomatic cases.
    3. Infected, symptomatic: RA drawn from adjusted RA distribution for symptomatic cases.

The results are visualized in Figure 3 and summarized in Table 1. Comparing the 100 swab sample size results to the wastewater data, swab samples are 25-fold more sensitive than Rothman et al. 2021 (7.4\times 10-8) and 15-fold more sensitive than Crits-Christoph et al. 2021 (1.4\times 10-7). Using swab sample pools of 200, swab sampling sensitivity is 175- and 95-fold more sensitive than Rothman et al. 2021 and Crits-Christoph et al. 2021 respectively. Averaging across the two wastewater studies, this gives an overall prediction of a 20- and 130-fold difference in relative abundance between municipal wastewater and community swabbing for SARS-CoV-2 for 100 and 200 swabs, respectively.

Code
def get_fig_7():
    df_swab, df_ww = get_fig_7_dfs()

    with np.errstate(divide="ignore"):
        df_ww["Relative Abundance"] = np.log10(df_ww["Relative Abundance"])
        df_swab["Relative Abundance"] = np.log10(df_swab["Relative Abundance"])

    fig, axs = plt.subplots(
        2,
        1,
        figsize=(8, 4),
        dpi=900,
        gridspec_kw={"height_ratios": [1.5, 1]},
    )
    fig.subplots_adjust(hspace=0.4)
    colors = sns.color_palette("viridis", 3)
    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[0],
        hue="Sample Size",
        palette=colors,
        data=df_swab,
        linewidth=0,
        bw_adjust=1.5,
        dodge=0.4,
        cut=1,
    )
    swab_medians_logged = []
    for subset in df_swab["Subset"].unique():
        for swab_sample_size in DEFAULT_SWAB_SAMPLE_SIZES:
            median = np.median(
                df_swab[
                    (df_swab["Subset"] == subset)
                    & (df_swab["Sample Size"] == swab_sample_size)
                ]["Relative Abundance"]
            )
            swab_medians_logged.append(median)

    plot_medians(axs[0], swab_medians_logged)
    sns.violinplot(
        x="Relative Abundance",
        y="Subset",
        ax=axs[1],
        color="#aedc31",
        data=df_ww,
        linewidth=0,
        bw_adjust=1.5,
        width=0.5,
        dodge=0.4,
    )
    ww_medians_logged = []
    ww_medians = []
    for study in df_ww["Subset"].unique():
        median = np.median(df_ww[(df_ww["Subset"] == study)]["Relative Abundance"])

        ww_medians_logged.append(median)
        ww_medians.append(10**median)
    plot_medians(axs[1], ww_medians_logged)

    axs[0].set_title(
        "a (Swabs)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="heavy"
    )
    axs[1].set_title(
        "b (Wastewater)", x=-0.31, y=0.95, fontsize=10, ha="left", fontweight="bold"
    )
    for ax in axs:
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.spines["left"].set_visible(False)
        ax.set_ylabel("")
        if ax == axs[1]:
            y_labels = [
                (
                    label.get_text().rsplit(" ", 1)[0]
                    + "\n"
                    + label.get_text().rsplit(" ", 1)[1]
                    if " " in label.get_text()
                    else label.get_text()
                )
                for label in ax.get_yticklabels()
            ]
            ax.set_yticks(ax.get_yticks())
            ax.set_yticklabels(y_labels)

        if ax == axs[0]:
            y_labels = [
                (
                    label.get_text().rsplit(" ", 1)[0]
                    + "\n"
                    + label.get_text().rsplit(" ", 1)[1]
                )
                for label in ax.get_yticklabels()
            ]
            ax.set_yticks(ax.get_yticks())
            ax.set_yticklabels(y_labels)
        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    xmin, xmax = -10, 0
    for x in np.arange(math.ceil(xmin), 1, 1):
        axs[0].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
        axs[1].axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
    for ax in axs:
        ax.set_xlim(xmin, xmax)
        current_xticks = ax.get_xticks()
        ax.set_xticks(current_xticks)
        ax.set_xticklabels(
            [
                "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
                for value in current_xticks
            ]
        )
    legend = axs[0].legend(
        title="Number of\nswabs",
        loc="center left",
        bbox_to_anchor=(1, 0.5),
        ncol=1,
        frameon=False,
    )
    legend._legend_box.align = "left"
    plt.tight_layout()


get_fig_7()
Figure 3: Expected relative abundance at 1% SARS-CoV-2 incidence for (a) pooled respiratory swabs and (b) wastewater sequencing (data from Grimm et al. 2024).
Sample Type Sample Size Median IQR Zero Count Fraction Fold difference vs Rothman Fold difference vs Crits-Christoph Fold difference vs Average
Original Composite Data 100 2.5e-06 6.2e-05 0.31 35 20 25
Original Composite Data 200 2e-05 0.00016 0.09 265 140 195
Original Composite Data 400 6.9e-05 0.00031 0.01 925 495 675
Adjusted Composite Data 100 1.8e-06 4.6e-05 0.31 25 15 20
Adjusted Composite Data 200 1.3e-05 0.00011 0.1 175 95 130
Adjusted Composite Data 400 4.1e-05 0.00019 0.01 550 295 405
Table 1: Comparison of swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence.

Estimating the cost

Given the estimates of relative abundance calculated above, we can in turn compare the sequencing costs required to detect a novel SARS-CoV-2-like pathogen spreading in the population to those required for wastewater-based detection. Using our estimate of relative abundance at 1% weekly incidence for 100- and 200-swab pools, we can estimate the weekly sequencing depth V required for detection at 1% cumulative incidence like so8:

V={\text{detection threshold}\over{\text{100 RA}{1\%} \times \text{cumulative incidence}}}

Assuming a detection threshold of 100 cumulative reads, a 100-swab sample yields a weekly sequencing depth of 5.5 \times 107, or around 1.1 \times 107 reads per weekday. Increasing the sample to 200 swabs reduces this to 1.5 \times 106 per weekday.

In our previous analysis of wastewater sequencing, we used a sequencing cost estimate of $5,500 per billion read pairs on a NovaSeq 6000; at a 100-read detection threshold9, this gave an estimated yearly sequencing cost of about $387,000 for Rothman and $205,000 for Crits-Christoph. Using the same cost here gives a yearly sequencing cost for swabs samples of roughly $15,600 for 100-swab pools and $2,200 for 200-swab pools, a significant saving compared to wastewater.

Carrying out swab sequencing via fractional runs of large sequencers, however, is likely to incur large time delays, when it is even possible at all. Using a smaller sequencer, perhaps in-house, is more realistic but also significantly more expensive. For example, using a 25M read-pair MiSeq v3, ($3,760 per run10) would cost roughly $427,600 for a 100-swab sample and $59,900 for a 200-swab sample. A more comprehensive cost comparison would require more precise modeling of sample collection and preparation that’s beyond the scope of this post.

Conclusion

In this report, we evaluate the expected sensitivity of pooled swab sampling and untargeted metagenomic sequencing for detecting a SARS-CoV-2-like pathogen. By analyzing MGS data from COVID-19 studies and adjusting for swab type and symptom status, we estimated the relative abundance of SARS-CoV-2 reads that could be obtained at a 1% weekly incidence. Our results suggest that this approach may offer one to two orders of magnitude greater sensitivity compared to municipal wastewater, depending on the swab sample pool size.

This investigation has several obvious limitations. Most glaringly, we only used data on SARS-CoV-2, making extrapolation to other diseases (let alone hypothetical future ones) difficult. The adjustments made to account for confounding factors are fairly simplistic, and the results might change after more detailed (and time-consuming) analysis. Finally, even having made these adjustments, our relative abundance estimates span many orders of magnitude, leaving us with a high degree of uncertainty about the true efficacy of swab sampling for MGS-based detection of a SARS-CoV-2-like pathogen.

Pool size emerges as a critical variable in our analysis, with larger pools dramatically reducing the chance of a zero-abundance sample. In addition to pool size, the performance of pooled swab sampling in our simulations is highly sensitive to key epidemiological variables like pathogen doubling time. A slow-growing pathogen with a long shedding period (modeled in internal analyses) would yield higher sensitivity than the numbers presented here11.

Based on our results, we are cautiously optimistic about the performance of pooled swab sampling for metagenomic biosurveillance, especially compared to municipal wastewater. Averaging across the two wastewater studies considered here gives a fold-difference in relative abundance (at 1% weekly COVID-19 incidence) of 20 and 130 for the adjusted 100- and 200-swab estimate. This increase in relative abundance brings a concomitant decrease in the sequencing depth required for detection.

More research is required to confirm the viability of such a program in practice. In particular, both improving the performance of wet-lab protocols and streamlining swab sampling will be required to create additional savings. But our results suggest that swab sampling in non-hospital settings could plausibly be a sensitive way to detect novel pathogens. CDC’s TGS program (see above) demonstrates the logistical feasibility of large-scale swab sampling, collecting samples from almost 50,000 individuals in just over a year. Incorporating metagenomic sequencing into such a program in a cost-effective way might help identify and mitigate the spread of novel biological threats.

Appendix 1: Confounders

Swab type adjustment

Tsang et al. 2021 performed a meta-analysis to evaluate how well different swab types detect SARS-CoV-2 using qPCR. Swab types included nasal swabs (n=1622), OP swabs (n=388), and pooled nasal and throat swabs (n=719), each of which were compared with NP swabs12. Using this approach, pooled nasal/throat swabs have the best diagnostic performance with a sensitivity of 0.97 (0.93-1.00), while nasal swabs performed worse at 0.87 (0.80–0.93)

Sample Type Sensitivity Negative Predictive Value
Nasal swabs (n=1622) 0.87 (0.80–0.93) 0.95 (0.88–0.99)
Throat swabs (n=388) 0.75 (0.52–0.92) 0.96 (0.94–0.98)
Pooled nasal/throat swabs (n=719) 0.97 (0.93–1.00) 0.99 (0.98–1.00)
NP swab (n=7973, includes comparison with saliva) 0.94 (0.91–0.97) 0.99 (0.98–1.00)
Table 2: Comparison of different swab sample types. Adapted from Tsang et al. 2021

This study gives us binary information about a pathogen’s presence in different swab types. However, to quantify the difference between swab types, we need quantitative information about the average viral load of SARS-CoV-2 in each swab type. One way to quantify this difference is by calculating the difference in cycle threshold (CT) values obtained when running qPCR on paired swabs sampled from the same person, with a lower CT indicating a higher viral load.

To compare NP to nasal swabs, we examined five studies that performed paired sampling and SARS-CoV-2 qPCR on the same patients using both swab types: Patriquin et al. 2022, McCulloch et al. 2020, Pere et al. 2020, Kojima et al. 2020, and Tu et al. 2020 (Figure 4). The mean differences in CT values (NP – nasal) for the five studies were -3.05, -1.55, 0.97, -3.91 and 0.38, respectively. The unweighted average of these five values is -1.43, indicating that NP swabs are typically more sensitive than nasal swabs.

Code
def get_fig_3():
    df = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/2024-06-18-np-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )

    df = df.melt(var_name="Study", value_name="CT Difference")

    pretty_study_names = {
        "Patriquin2022": "Patriquin et al. 2022",
        "McCulloch2020": "McCulloch et al. 2020",
        "Pere2020": "Pere et al. 2020",
        "Kojima2020": "Kojima et al. 2020",
        "Tu2020": "Tu et al. 2020",
    }

    df["Study"] = df["Study"].map(pretty_study_names).values

    mean_ct_diff = df.groupby("Study", as_index=False)["CT Difference"].mean()
    if DEBUG:
        print(mean_ct_diff)
        print(f"Mean CT Difference: {mean_ct_diff['CT Difference'].mean()}")
    fig = plt.figure(figsize=(8, 4), dpi=900)

    colors = sns.color_palette("viridis", len(df["Study"].unique()))

    sns.stripplot(
        data=df,
        y="Study",
        x="CT Difference",
        hue="Study",
        jitter=True,
        zorder=-1,
        palette=colors,
        alpha=0.5,
    )
    sns.pointplot(
        data=mean_ct_diff,
        y="Study",
        x="CT Difference",
        linestyles="none",
        markers="d",
        color="#36454F",
        markersize=7,
        errorbar=None,
        zorder=1,
    )
    plt.legend([], [], frameon=False)
    plt.ylabel("")
    plt.xlabel("SARS-CoV-2 qPCR CT Δ (NP - Nasal)")
    plt.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    x_min, x_max = plt.xlim()

    plt.text(x_max / 2, -0.6, "Favors Nasal", fontsize=10, color="black", ha="center")
    plt.text(
        x_min / 2,
        -0.6,
        "Favors Nasopharyngeal",
        fontsize=10,
        color="black",
        ha="center",
    )
    x_min = math.ceil(x_min / 5) * 5
    x_max = math.ceil(x_max / 5) * 5

    x_marks = np.arange(x_min, x_max, 2.5)
    for x in x_marks:
        if x == 0:
            continue
        plt.axvline(
            x=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-2
        )

    plt.axvline(x=0, color="red", linestyle="--", alpha=0.5, zorder=-2)

    plt.gca().spines["right"].set_visible(False)
    plt.gca().spines["top"].set_visible(False)
    plt.gca().spines["left"].set_visible(False)
    plt.tight_layout()


get_fig_3()
Figure 4: Nasal swabs vs NP swabs. Data from Patriquin et al. 2022, McCulloch et al. 2020, Pere et al. 2020, Kojima et al. 2020, and Tu et al. 2020. Patriquin et al. 2022 and Pere et al. 2020 performed professional collection of both NP and nasal swabs; other studies performed professional collection of NP swabs but had patients collect nasal swabs themselves. Diamonds indicate mean values.

Similarly, to compare OP to nasal swabs, we examined two comparative studies: Berenger et al. 2020 and Goodall et al. 2022. Berenger et al. 2020 only give median, 10th, and 90th percentile values; assuming for the sake of tractability that these are drawn from a normal distribution, we get a difference between OP and nasal swabs of 0.89 (OP = 29.94, Nasal = 29.05). For Goodall, the mean difference is 0.99 (Figure 5). This yields an unweighted average of 0.92, indicating that OP swabs are less sensitive than nasal swabs.

Code
def get_fig_4():
    df = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/goodall-op-nasal-ct.tsv",
        sep="\t",
        skiprows=1,
    )
    pretty_study_names = {
        "Goodall2022": "Goodall et al. 2022",
    }
    df = df.melt(var_name="Study", value_name="CT Difference")
    df["Study"] = df["Study"].map(pretty_study_names).values
    mean_ct_diff = df.groupby("Study", as_index=False)["CT Difference"].mean()
    if DEBUG:
        print(mean_ct_diff)
        print(f"Mean CT Difference: {mean_ct_diff}")
    fig, ax = plt.subplots(figsize=(8, 1.65), dpi=900)

    viridis = sns.color_palette("viridis", 1)
    sns.stripplot(
        data=df,
        y="Study",
        x="CT Difference",
        hue="Study",
        jitter=True,
        zorder=-1,
        ax=ax,
        alpha=0.8,
        palette=viridis,
    )
    sns.pointplot(
        data=mean_ct_diff,
        y="Study",
        x="CT Difference",
        linestyles="none",
        markers="d",
        color="#36454F",
        markersize=8,
        errorbar=None,
        zorder=1,
        ax=ax,
    )
    ax.legend([], [], frameon=False)

    ax.set_ylabel("")
    ax.set_xlabel("SARS-CoV-2 qPCR CT Δ (OP - Nasal)")
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    x_min, x_max = ax.get_xlim()

    ax.text(x_max / 2, -0.6, "Favors Nasal", fontsize=9, color="black", ha="center")
    ax.text(
        x_min / 2, -0.6, "Favors Oropharyngeal", fontsize=9, color="black", ha="center"
    )

    x_min = math.ceil(x_min / 5) * 5
    x_max = math.ceil(x_max / 5) * 5

    x_marks = np.arange(x_min, x_max, 2.5)
    for x in x_marks:
        if x == 0:
            continue
        ax.axvline(
            x=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-2
        )
    ax.axvline(x=0, color="red", linestyle="--", alpha=0.5, zorder=-2)

    ax.axhline(y=0.5, color="grey", linestyle="--", alpha=0.5, linewidth=0.5)

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.spines["left"].set_visible(False)
    plt.tight_layout()
    plt.show()


get_fig_4()

# Computing median of Berenger et al. 2020:


# Source: https://www.medrxiv.org/content/10.1101/2020.05.05.20084889v1.full#:~:text=Comparing%20the%20samples,respectively%20(p%3E0.09).
data = {
    "E_NP": {"median": 25.5, "10th": 20.5, "90th": 29.5},
    "E_Nasal": {"median": 27.6, "10th": 24.7, "90th": 32.4},
    "E_Throat": {"median": 28.7, "10th": 23.5, "90th": 34.2},
    "RdRp_NP": {"median": 27.9, "10th": 23.5, "90th": 32.4},
    "RdRp_Nasal": {"median": 30.5, "10th": 27.5, "90th": 35.0},
    "RdRp_Throat": {"median": 31.3, "10th": 26.5, "90th": 35.5},
}


def calculate_mean_std(median, p10, p90):
    mean = median
    std = (p90 - p10) / (2 * norm.ppf(0.9))
    norm_dist = norm(mean, std)
    arrived_p10 = norm_dist.ppf(0.1)
    arrived_p90 = norm_dist.ppf(0.9)
    if abs(p10 - arrived_p10) > 1 or abs(p90 - arrived_p90) > 1:
        print(
            f"Warning: p10 or p90 values are not within one point of the arrived percentiles. p10: {p10}, arrived_p10: {arrived_p10}, p90: {p90}, arrived_p90: {arrived_p90}"
        )
    return mean, std


x = np.linspace(10, 40, 1000)

distributions = []

for key, values in data.items():
    mean, std = calculate_mean_std(values["median"], values["10th"], values["90th"])
    distribution = np.random.normal(mean, std, 100000)

    distributions.append(distribution)

mean_of_means = {}
swab_types = ["NP", "Nasal", "Throat"]

# Not doing a weighted average because there are approximately the
# same number of samples for both qPCR targets.
for swab in swab_types:
    swab_means = [
        np.mean(distribution)
        for key, distribution in zip(data.keys(), distributions)
        if swab in key
    ]
    mean_of_means[swab] = np.mean(swab_means)
Figure 5: Nasal swabs vs Oro-pharyngeal swabs. Data from Goodall et al. 2022. Diamond indicates mean.

Patient status and disease onset adjustment

The four MGS studies we have examined include both inpatients and outpatients, with the latter making up 68 out of 196 individuals (36%) (Figure 6). The remaining study population consists of inpatients with either an unknown level of disease severity, or known ventilator or ICU usage.

Code
def get_fig_5():
    study_dfs = get_studies()

    df_np_babiker = study_dfs["Babiker et al. 2020"]
    df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
    df_op_lu = study_dfs["Lu et al. 2021"]
    df_np_mostafa = study_dfs["Mostafa et al. 2020"]

    order = ["Outpatient", "Inpatient", "Required\nventilator", "ICU", "Unknown"]

    lu_counts = df_op_lu["patient_status"].value_counts().reindex(order)
    babiker_counts = df_np_babiker["patient_status"].value_counts().reindex(order)
    mostafa_counts = df_np_mostafa["patient_status"].value_counts().reindex(order)
    rodriguez_counts = (
        df_np_rodriguez["patient_status"].value_counts().reindex(order).dropna()
    )

    if DEBUG:
        print(lu_counts, babiker_counts, mostafa_counts, rodriguez_counts)

    fig, axs = plt.subplots(
        1, 4, figsize=(10, 4), width_ratios=[0.7, 1.5, 2.4, 4], sharey=True, dpi=800
    )

    colors = sns.color_palette("viridis", 10)
    color_dict = {
        "Outpatient": colors[9],
        "Inpatient": colors[3],
        "Required\nventilator": colors[5],
        "ICU": colors[0],
        "Unknown": colors[7],
    }

    axs[0].bar(
        lu_counts.index, lu_counts.values, color=color_dict["Inpatient"], width=0.5
    )
    axs[0].set_title("Lu et al. 2021", fontsize=10)
    axs[0].set_ylabel("Count")

    axs[1].bar(
        babiker_counts.index,
        babiker_counts.values,
        color=[color_dict[x] for x in babiker_counts.index],
        width=0.8,
    )
    axs[1].set_title("Babiker et al. 2020", fontsize=10)

    axs[3].bar(
        mostafa_counts.index,
        mostafa_counts.values,
        color=[color_dict[x] for x in mostafa_counts.index],
    )
    axs[3].set_title("Mostafa et al. 2020", fontsize=10)
    axs[3].set_ylim(0, max(mostafa_counts.values) + 10)
    axs[3].set_yticks(np.arange(0, max(mostafa_counts.values) + 30, 10))

    axs[2].bar(
        rodriguez_counts.index,
        rodriguez_counts.values,
        color=[color_dict[x] for x in rodriguez_counts.index],
    )
    axs[2].set_title("Rodriguez et al. 2021", fontsize=10)
    axs[2].set_ylim(0, max(rodriguez_counts.values) + 5)
    axs[2].set_yticks(np.arange(0, max(rodriguez_counts.values) + 10, 5))

    for ax in axs:
        ax.spines["right"].set_visible(False)
        ax.spines["top"].set_visible(False)
        ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
        ax.tick_params(
            axis="x",
            which="both",
            top=False,
            bottom=False,
            labeltop=False,
            labelbottom=True,
            labelsize=9,
        )
        for x in np.arange(0, 50, 10):
            ax.axhline(
                y=x, color="grey", linestyle="--", alpha=0.5, linewidth=0.5, zorder=-1
            )

    plt.tight_layout()
    plt.show()


get_fig_5()

## Computing share of outpatients across studies

study_dfs = get_studies()
df_np_babiker = study_dfs["Babiker et al. 2020"]
df_np_rodriguez = study_dfs["Rodriguez et al. 2021"]
df_op_lu = study_dfs["Lu et al. 2021"]
df_np_mostafa = study_dfs["Mostafa et al. 2020"]

order = ["Outpatient", "Inpatient", "Required\nventilator", "ICU", "Unknown"]

lu_counts = df_op_lu["patient_status"].value_counts().reindex(order)
babiker_counts = df_np_babiker["patient_status"].value_counts().reindex(order)
mostafa_counts = df_np_mostafa["patient_status"].value_counts().reindex(order)
rodriguez_counts = (
    df_np_rodriguez["patient_status"].value_counts().reindex(order).dropna()
)

composite_counts = pd.DataFrame(
    {
        "Lu": lu_counts,
        "Babiker": babiker_counts,
        "Mostafa": mostafa_counts,
        "Rodriguez": rodriguez_counts,
    }
)

total_counts_across_studies = composite_counts.sum(axis=1)
share_outpatients_across_studies = (
    total_counts_across_studies["Outpatient"] / total_counts_across_studies.sum()
)
outpatient_share = f"{share_outpatients_across_studies:.0%}"
if DEBUG:
    print(outpatient_share)
Figure 6: Patient composition of Lu et al. 2021, Babiker et al. 2020, Mostafa et al. 2020, and Rodriguez et al. 2021

Greater COVID-19 disease severity is associated with increased viral load; all else equal, we thus expect hospitalized patients to show higher viral abundance than outpatients. However, viral loads are also inversely correlated with time since symptom onset. As it takes time for someone to become hospitalized, inpatients are generally tested later in the course of their disease than outpatients, resulting in lower viral loads. It’s not ex ante obvious which of these two effects we expect to dominate.

Perhaps unsurprisingly, given these opposing effects, different studies give inconsistent results when comparing SARS-CoV-2 viral loads between different groups of patients. Compared to inpatients, outpatients showed higher SARS-CoV-2 CT values in Babiker et al. (mean outpatient-inpatient difference of +1.17) but lower CT values in Rodriguez et al. (mean difference of -4.45). We identified two further studies that allowed us to make this comparison: Souverein et al. 2022 compared ~20,000 outpatients to ~300 inpatients and found an average CT difference of +1.3, while Knudtzen et al. 2021 compared 87 outpatients to 82 inpatients and found an average difference of -2.3. An unweighted average of the CT deltas in these four studies finds that outpatients have a mean CT value 1.07 lower than inpatients. However, it’s unclear that this number is very meaningful, given the large differences between studies and confounding effect of disease onset.

One of the four studies used above, Knudtzen et al. 2021, contains information on both patient status and symptom onset, allowing us to disentangle the two effects for this dataset (Figure 7). Fitting a simple linear model on this data, which takes into account patient status, the CT value increase per additional day is 0.37 (+- 0.13), while inpatient/outpatient status has no significant effect.

Code
def get_fig_6():
    df_knudtzen_ct_days = pd.read_csv(
        "../data/2024-06-17-swab-sensitivity/knudtzen2021_ct_days.tsv",
        sep="\t",
        skiprows=1,
    )
    df_knudtzen_ct_days["Patient Status"] = df_knudtzen_ct_days[
        "Patient Status"
    ].astype("category")
    df_knudtzen_ct_days["Day"] = df_knudtzen_ct_days["Day"].round()
    status_to_color = {
        status: idx
        for idx, status in enumerate(df_knudtzen_ct_days["Patient Status"].unique())
    }
    colors = sns.color_palette("viridis", len(status_to_color))

    reg = smf.ols('CT ~ Day + Q("Patient Status")', data=df_knudtzen_ct_days)
    results = reg.fit()
    if DEBUG:
        print(results.summary())

    fig, ax = plt.subplots(figsize=(6, 3.5), dpi=900)
    for status, color in zip(df_knudtzen_ct_days["Patient Status"].unique(), colors):
        subset = df_knudtzen_ct_days[df_knudtzen_ct_days["Patient Status"] == status]
        plt.scatter(
            subset["Day"],
            subset["CT"],
            c=[color],
            label=status,
            edgecolors="w",
            linewidths=0.5,
        )

        line_x_values = range(int(subset["Day"].min()), int(subset["Day"].max()) + 1)
        line_y_values = results.predict(
            pd.DataFrame(
                {"Day": line_x_values, "Patient Status": [status] * len(line_x_values)}
            )
        )

        plt.plot(line_x_values, line_y_values, color=color, linestyle="--", linewidth=2)

    plt.xlabel("Days from symptom onset")
    plt.ylabel("qPCR-CT Value")
    plt.legend(
        title="", loc="upper center", bbox_to_anchor=(0.5, -0.15), ncol=2, frameon=False
    )

    for val in range(0, 20, 5):
        ax.axvline(
            x=val, color="grey", linestyle="--", linewidth=0.5, alpha=0.3, zorder=-1
        )

    for val in range(15, 40, 5):
        ax.axhline(
            y=val, color="grey", linestyle="--", linewidth=0.5, alpha=0.3, zorder=-1
        )

    ax.set_xticks(range(0, int(df_knudtzen_ct_days["Day"].max()) + 1, 5))

    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)

    plt.tight_layout()
    plt.show()


get_fig_6()
Figure 7: Relationship between days since symptom onset and viral load. Data extracted from Knudtzen et al. 2021, Figure 3A.

A second relevant study is He et al. 2020, who collected 414 OP swabs sampled from 94 inpatients in Guangzhou, China, none of whom were critically ill. Samples were collected longitudinally for some of these patients, with time from symptom onset ranging from 0 to 32 days. Average CT values upon symptom onset were around 31 CT values, rising to 35 at 3-4 days and reaching the limit of detection (CT=40) after 21 days. Treating the change in CT units as linear, this gives an average increase of 0.43 CT units per day; similar to the result from Knudtzen et al. Integrating the results from both of these studies, we estimate that each passing day between symptom onset and swab sampling leads to an increase of roughly 0.4 CT units.

Asymptomatic individual adjustment

Most individuals sampled in the four MGS studies were symptomatic patients, but we expect most infected people who would be captured by a community swabbing program to be asymptomatic, presymptomatic, or only mildly ill. About one third of COVID-19 cases are persistently asymptomatic (Sah et al. 2021); as such, we need to account for how they differ from symptomatic cases.

To investigate this, we analyzed four studies that compared qPCR results of asymptomatic COVID cases to those that were either symptomatic at the time of testing or developed symptoms later:

  • Hall et al. 2022 analyzed the results of Boston University’s testing program, which tested all individuals who came to campus, and logged symptoms upon taking the test. Comparing the median of asymptomatic individuals (n=319) with that of symptomatic (n=512) shows asymptomatics to have a CT that is 8.1 points higher (29.9 - 21.8).
  • Long et al. 2020, taking place in Wanzhou District, China, compared the first positive SARS-CoV-2 qPCR result of 37 quarantined, persistently asymptomatic individuals to 37 symptomatic patients with similar characteristics (sex, age, and comorbidities) and found no clear difference in median CT values between the two groups.
  • Lee et al. 2022 studied patients in a South Korean isolation facility. They do not provide raw data or median values, but report there was no qPCR CT value difference between symptomatic (n=214) and asymptomatic patients (n=89).
  • Yang et al. 2023 performed a prospective cohort study of patients admitted to a Shenzhen hospital with lab-confirmed SARS-CoV-2 infection, collecting both symptom status and repeat qPCR measurements. They provide peak CT values for asymptomatic (n=290), mildly ill patients (n=1319), and more severely ill patients (n=434). Asymptomatic patients have a peak median CT value 0.99 points higher than that of mildly ill patients (the study contains vaccinated individuals which is an important confounder for symptom status and viral load).

It’s unclear what accounts for the large inconsistency between studies. For the purposes of this analysis, we calculated the unweighted average CT value across the four studies (2.27 CT units lower for asymptomatics compared to those who were or later became symptomatic) but this is at best a crude measure.

Appendix 2: Supplemental Figures and Tables

Code
def get_fig_s1():
    ra_lists = list(get_study_ras().values())
    study_labels = list(get_study_ras().keys())
    composite_ras = get_composite_ras()
    study_labels.append("Composite Data")
    ra_lists.append(composite_ras)
    raw_distributions = []
    for ras in ra_lists:
        samples = get_logit_normal_samples(ras)
        raw_distributions.append(samples)

    fig, ax = plt.subplots(1, 1, figsize=(7, 3), dpi=900)
    log_distributions = [np.log10(distribution) for distribution in raw_distributions]
    log_rel_abuns = [np.log10(ras) for ras in ra_lists]

    viridis_palette = sns.color_palette("viridis", n_colors=len(study_labels) - 1)
    composite_study_color = "#aedc31"
    viridis_palette.append(composite_study_color)

    sns.violinplot(
        data=log_distributions,
        ax=ax,
        inner=None,
        palette=viridis_palette,
        orient="h",
        zorder=1,
        linewidth=0.5,
        cut=0,
        alpha=1,
        fill=False,
        bw_adjust=1.5,
    )

    sns.stripplot(
        data=log_rel_abuns,
        ax=ax,
        jitter=True,
        palette=viridis_palette,
        orient="h",
        alpha=0.7,
        zorder=2,
    )
    ax.set_yticks(ax.get_yticks())
    ax.set_yticklabels(study_labels)

    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["left"].set_visible(False)
    ax.set_ylabel("")

    y_labels = [
        (
            label.get_text().rsplit(" ", 1)[0]
            + "\n"
            + label.get_text().rsplit(" ", 1)[1]
            if " " in label.get_text()
            else label.get_text()
        )
        for label in ax.get_yticklabels()
    ]

    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)
    # x_min, x_max = ax.get_xlim()
    new_x_max = 0
    new_x_min = -10
    ax.set_xlim(new_x_min, new_x_max)
    current_xticks = ax.get_xticks()

    ax.set_xticks(current_xticks)
    ax.set_xticklabels(
        [
            "$10^{{{}}}$".format(int(value)) if value != 0 else "1"
            for value in current_xticks
        ]
    )

    for x in np.arange(math.ceil(new_x_min), 1, 1):
        ax.axvline(
            x=x, color="black", linestyle="--", linewidth=0.3, alpha=0.2, zorder=-1
        )
    ax.set_xlabel("Relative Abundance")

    plt.tight_layout()
    plt.show()


get_fig_s1()
Figure 8: Raw relative abundance values and logit-normal distributions for each study and the composite dataset.
Code
# Calculate zero count fractions for each sample size
def get_fig_zero_count():
    composite_small_sample_sizes = get_composite_p2ra(
        swab_sample_sizes=SMALL_SWAB_SAMPLE_SIZES
    )

    composite_normal_sample_size = get_composite_p2ra()
    df_swab = pd.concat([composite_small_sample_sizes, composite_normal_sample_size])

    zero_count_fractions = []
    sample_sizes = []
    for swab_sample_size in sorted(df_swab["Sample Size"].unique()):
        subset_df = df_swab[df_swab["Sample Size"] == swab_sample_size]
        zero_count_fraction = round((subset_df["Relative Abundance"] == 0).mean(), 2)
        zero_count_fractions.append(zero_count_fraction)
        sample_sizes.append(str(swab_sample_size))

    fig, ax = plt.subplots(figsize=(6, 3), dpi=800)
    bars = ax.bar(
        sample_sizes, zero_count_fractions, color=sns.color_palette("viridis", 1)
    )

    ax.set_xlabel("Pool Size (Number of Swabs)")
    ax.set_ylabel("Zero Count %")
    ax.set_ylim(0, 1)

    for bar in bars:
        height = bar.get_height()
        ax.text(
            bar.get_x() + bar.get_width() / 2.0,
            height,
            f"{height*100:.0f}%",
            ha="center",
            va="bottom",
            fontsize=9,
        )
    # Convert y-axis ticks to percentages
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda y, _: "{:.0%}".format(y)))
    ax.spines["right"].set_visible(False)
    ax.spines["top"].set_visible(False)
    ax.tick_params(axis="y", which="both", left=False, right=False, labelleft=True)

    for y in np.arange(0, 1.1, 0.1):
        ax.axhline(
            y=y, color="grey", linestyle="--", alpha=0.2, linewidth=0.5, zorder=-1
        )

    plt.tight_layout()
    plt.show()


get_fig_zero_count()
Figure 9: Zero count shares for different pool sizes.
Sample Type Sample Size Median IQR Zero Count Fraction Fold difference vs Rothman Fold difference vs Crits-Christoph Fold difference vs Average
Lu et al. 2021 100 2e-05 0.00041 0.29 270 145 200
Lu et al. 2021 200 0.00014 0.0009 0.08 1900 1010 1385
Lu et al. 2021 400 0.0004 0.0013 0.01 5455 2900 3975
Babiker et al. 2020 100 1.4e-06 2e-05 0.29 20 10 15
Babiker et al. 2020 200 8.1e-06 5.1e-05 0.08 110 60 80
Babiker et al. 2020 400 2.2e-05 8.1e-05 0.01 290 155 210
Mostafa et al. 2020 100 2.1e-07 3.5e-06 0.44 5 0 0
Mostafa et al. 2020 200 1.4e-06 5.1e-06 0.2 20 10 15
Mostafa et al. 2020 400 2.5e-06 5.6e-06 0.03 35 20 25
Rodriguez et al. 2021 100 3.7e-06 0.00012 0.29 50 25 35
Rodriguez et al. 2021 200 4.2e-05 0.0004 0.08 565 300 415
Rodriguez et al. 2021 400 0.00015 0.00073 0.01 2080 1105 1515
Original Composite Data 100 2.5e-06 6.2e-05 0.31 35 20 25
Original Composite Data 200 2e-05 0.00016 0.09 265 140 195
Original Composite Data 400 6.9e-05 0.00031 0.01 925 495 675
Table 3: Comparison of study-level and unadjusted composite swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence.
Sample Type Sample Size Median IQR Zero Count Fraction Fold difference vs Rothman Fold difference vs Crits-Christoph Fold difference vs Average
Original Composite Data 50 0 1.1e-05 0.56 0 0 0
Original Composite Data 75 3.6e-07 3.3e-05 0.42 5 5 5
Adjusted Composite Data 50 0 7.8e-06 0.56 0 0 0
Adjusted Composite Data 75 2.4e-07 1.9e-05 0.42 5 0 0
Table 4: Comparison of swab sampling relative abundance to wastewater relative abundance at 1% weekly incidence, assuming swab sample sizes of 50 and 75.

Footnotes

  1. Rodriguez et al. (2021) generously shared qPCR and MGS data. We also thank Lu et al. (2021) and Mostafa et al. (2020) for answering questions about methodological details.↩︎

  2. At pool sizes of 75 swabs, and 50 swabs respectively, 40%, and 54% of pools contain no SARS-CoV-2 at 1% weekly incidence (Table 4, Figure 9).↩︎

  3. 3 days approximates the doubling time of wild-type SARS-CoV-2 at the beginning of the pandemic (Pellis et al. 2021). Assuming simple exponential growth in the number of cases, the current prevalence given a weekly incidence I is equal to P = I \times \frac{W}{W-1}, where W is the disease’s weekly growth rate. Assuming the 3 day doubling time, the weekly growth rate in this model is 5.040; at 1% weekly incidence, this gives a current prevalence of 1.248%.↩︎

  4. For all swab sampling studies, we analyze samples that test positive in qPCR measurements, including qPCR-positive samples that have a SARS-CoV-2 relative abundance of 0.0.↩︎

  5. They also ran MGS on samples of thirty individuals who tested negative to identify the underlying pathogen.↩︎

  6. In qPCR, a fluorescently tagged genetic target sequence undergoes repeated amplification cycles (one cycle is ~one doubling). The fluorescent signal is measured after each amplification cycle. The cycle threshold is defined as the number of cycles required for the signal to become measurable. A low CT value thus indicates the target sequence was present in higher quantities in the initial sample, as it took fewer amplification cycles for the sequence to become detectable. Conversely, a higher CT value suggests a lower initial concentration of the target sequence. See here for additional information.↩︎

  7. Results for sampling with pools of 50 and 75 swabs can be found in Table 4.↩︎

  8. This approach is discussed further in our previous preprint on wastewater.↩︎

  9. Any detection algorithm will require the presence of reads originating from the virus of interest. Certain methods—such as mapping to a known reference—may detect a pathogen from just a few reads, while less targeted methods may require far more. Here, we do not choose any specific detection method but rather simply model a virus as “detected” when the cumulative viral reads exceed a threshold of 100.↩︎

  10. Sequencing costs found online are doubled to account for library preparation and personnel costs. I.e, the cost of a 25M read MiSeq v3 run is doubled from $1,880 to $3,760. Similarly, a NovaSeq 6000 run with an S4 flow cell costs around $25,500 for 10B read pairs, which gives a total cost of $55,000 per 10B reads.↩︎

  11. This is because a lower weekly growth rate results in higher prevalence when weekly incidence reaches 1% (see footnote 3).↩︎

  12. In comparing the performance of different sample types (e.g., NP swab vs nasal swab), any positive sample was treated as ground truth (there is thus no possibility of having false positives).↩︎