From 2f636c4a4289bceba9a8b5740b370238325a44cb Mon Sep 17 00:00:00 2001 From: Tom White Date: Tue, 24 Nov 2020 09:44:29 +0000 Subject: [PATCH] Add docs for windowing and cohorts --- docs/getting_started.rst | 82 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index 6cb3a708b..09df9573d 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -241,7 +241,86 @@ shows how it can be used in the context of doing something simple like counting # This how many sgkit functions handle missing data internally: sg.variant_stats(ds).variant_n_het.item(0) +Windowing +~~~~~~~~~ +It is common to compute statistics in windows along the genome. Some :ref:`api_methods` in sgkit +are "windowing aware" and will compute values for windows defined in a dataset. If no windows +are defined then the values will typically be computed for each variant. It is therefore +important to define windows *before* computing statistics on a dataset. + +Windows are intervals that span the ``variants`` dimension in a dataset, and they are defined +using the :func:`sgkit.window` function. Currently only fixed-sized windows are supported, by +providing the ``size`` argument, which refers to the number of variants in each window. An +optional ``step`` argument may be provided to control the spacing between windows. By default, +it is the same as the ``size``, giving contiguous windows. + +This example shows the effect of computing the diversity statistic: first with no windows defined, +then with windows. + +.. ipython:: python + :okwarning: + + import sgkit as sg + import xarray as xr + ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=50) + + # Define a single cohort for all samples + ds["sample_cohort"] = xr.DataArray(np.full(ds.dims['samples'], 0), dims="samples") + + # The diversity statistic is computed for every variant since no windows are defined + sg.diversity(ds, merge=False) + + # Define windows of size 20 variants. This creates a new dimension called `windows`, and + # some new variables for internal use. + ds = sg.window(ds, 20) + + # The diversity statistic is now computed for every window + sg.diversity(ds, merge=False) + +Cohorts +~~~~~~~ + +During analysis we often want to be able to group samples into populations, and compute statistics +based on these groups. Groups of samples are referred to as *cohorts* in sgkit. + +Cohorts are defined by a mapping from samples to cohort index. The following example creates +a ``sample_cohort`` variable to group a dataset of ten samples into three cohorts. Note that first +value is ``-1``, which means the corresponding sample is not in any of the three cohorts, and +will be ignored when computing cohort statistics. + +.. ipython:: python + :okwarning: + + import sgkit as sg + import xarray as xr + ds = sg.simulate_genotype_call_dataset(n_variant=100, n_sample=10) + ds["sample_cohort"] = xr.DataArray(np.array([-1, 0, 1, 1, 1, 1, 0, 2, 2, 2]), dims="samples") + +Typically the ``sample_cohort`` variable is derived from a dataframe that has the sample/cohort +mapping. + +Cohort-level statistics can have repeated ``cohorts`` dimensions. :func:`sgkit.Fst`, for example, +produces statistics for *pairs* of cohorts, which is represented as a variable with dimensions +``(windows, cohorts_0, cohorts_1)``, making it possible to read off the value of the statistic +for any pair of cohorts. + +It's convenient to name cohorts, to avoid errors that can occur when using index values. This +example shows how to give cohorts names. + +.. ipython:: python + :okwarning: + + ds = sg.window(ds, 20) + ds = sg.Fst(ds) + + cohort_names = ["Africa", "Asia", "Europe"] + ds = ds.assign_coords({"cohorts_0": cohort_names, "cohorts_1": cohort_names}) + ds.stat_Fst.sel(cohorts_0="Africa", cohorts_1="Asia").values + +Methods that work with cohorts will, by default, operate over all cohorts at once. Sometimes +however you might only want to run the computation for a subset of cohorts, in which case you can +explicitly specify the cohorts when calling the function. Chaining operations ~~~~~~~~~~~~~~~~~~~ @@ -265,12 +344,15 @@ Xarray and Pandas operations in a single pipeline: ds_qc.variant_call_rate.to_series().describe() # Build a pipeline that filters on call rate and computes Fst between two cohorts + # for windows of size 20 variants ( ds # Add call rate and other statistics .pipe(sg.variant_stats) # Apply filter to include variants present across > 80% of samples .pipe(lambda ds: ds.sel(variants=ds.variant_call_rate > .8)) + # Create windows of size 20 variants + .pipe(lambda ds: sg.window(ds, 20)) # Assign a "cohort" variable that splits samples into two groups .assign(sample_cohort=np.repeat([0, 1], ds.dims['samples'] // 2)) # Compute Fst between the groups