# Process FCS files and Create AnnData Object

## Introduction

In this short tutorial we show how to process fcs files and create
an [anndata object](https://anndata.readthedocs.io/en/latest/). The fcs
files are from the publication _Data-driven phenotypic dissection of AML
reveals progenitor-like cells that correlate with prognosis
(10.1016/j.cell.2015.05.047)_. To facilitate this tutorial we deposited
the required files on
[figshare](https://figshare.com/account/home#/projects/140062).

## Setup

    knitr::opts_chunk$set(message = FALSE, warning = FALSE)
    pkgs = c('flowCore', 'anndata', 'tidyverse', 'magrittr', 'httr', 'reticulate')
    for(i in pkgs){
      suppressPackageStartupMessages(library(i, character.only = TRUE))
    }

## Define the python interpreter to use

Since the AnnData object is a python data structure we need to setup our
python interpreter first. Either you can setup a new environment or use
the one we use for the scQUEST package ([installation guide for
scQUEST](https://ai4scr.github.io/scQUEST/installation.html#package-installation)).
Important is, that the anndata versions of the environment you use here
is the same as in the environment in which you want to use the created
anndata object.

To find the path to the python interpreter run the following snipped in
the terminal

    conda activate scQUEST
    which python
    # /usr/local/Caskroom/miniconda/base/envs/scQUEST/bin/python

Now point `reticulate` to the interpreter you want to use.

    reticulate::use_python('/usr/local/Caskroom/miniconda/base/envs/scQUEST/bin/python')

## Request for figshare api

Here we setup the http request to download the fcs files

    # request for figshare api
    project_id = 140062
    articel_id = 19867114
    PATH = '~/.scQUEST/Levine_CyTOF_AML_AnnotatedCellTypes/'
    REQUEST = paste0('https://api.figshare.com/v2/articles/', articel_id, '/files')

## Helpers

Here we define some helper functions that facilitate mapping of
attributes and the download of the files.

    # helpers
    mapper = function(key, df){
      mapping= list()
      cols = colnames(df)
      cols = cols[cols != key]
      for(i in cols){
        m = map(df[[i]], ~ .x)
        names(m) = df[[key]]
        mapping[[i]] = m
      }
      return (mapping)
    }

    download_files = function(path = PATH, request = REQUEST){
      path = path.expand(path)
      dir.create(path, showWarnings = TRUE, recursive = TRUE, mode = "0777")

      resp = GET(request)
      stop_for_status(resp)

      files = content(resp, 'parsed')
      files = map(files, function(x){
        url = x$download_url
        fpath = paste0(path, x$name)
        if(!file.exists(fpath)){
          download.file(url, fpath)
          }
        return(c(url, fpath))
      })
    }

## Download and mappings

In a next step we download the files form figshare to `PATH`.
Furthermore, we provide some files that provide some more information
regarding the channel names and the sample_ids. For example
`rename.channel` indicates for what this channel was used and what kind
of marker it represents.

    # download files
    files = download_files(PATH, REQUEST)
    files = unlist(map(files, ~.x[2]))
    files.fcs = files[grepl('.fcs$', files)]
    files.csv = files[grepl('.csv$', files)]

    files.sampleID = files.csv[grepl('sampleID', files.csv)]
    files.channels = files.csv[grepl('channels', files.csv)]

    # mappings
    rename.sample = readr::read_csv(files.sampleID)
    map.sample = mapper('sample_id', rename.sample)

    rename.channel = readr::read_csv(files.channels)
    map.channel = mapper('channel', rename.channel)

## Process the fcs files

Now we are ready to process the actuall FCS files. We loop over all fcs
files and construct the attributes we later use to generated the anndata
object (`X`, `OBS`, `VAR`).

    # load FCS file
    X = NULL
    OBS = NULL
    VAR = NULL
    fcs.header = list()

    f.count = 0
    for(f in files.fcs){
      f.count = f.count + 1
      f.base =  basename(f)
      cat(f.count, '/', length(files.fcs), ' reading in file ', f.base, '\n')

      header = read.FCSheader(f)
      header.tbl = tibble(keyword = names(header[[1]]), value=header[[1]])

      fcs <- read.FCS(f)
      x = exprs(fcs)

      obs = list(fcs_file = f.base)
      for(i in names(map.sample)){
        obs[[i]] = map.sample[[i]][[f.base]]
      }

      obs = do.call(tibble, obs)
      obs %<>% slice(rep(1, each = dim(x)[1]))

      var = parameters(fcs)@data %>% select(-range, -minRange, -maxRange)

      X = rbind(X,x)
      OBS = rbind(OBS, obs)

      if(is.null(VAR)) VAR = var
      else stopifnot(all(VAR == var))

      stopifnot(!f %in% names(fcs.header))
      fcs.header[[f.base]] = header.tbl
    }

    ## 1 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_CD4 T cells_H1.fcs
    ## 2 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_CD8 T cells_H1.fcs
    ## 3 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_CD16- NK cells_H1.fcs
    ## 4 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_CD16+ NK cells_H1.fcs
    ## 5 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_Mature B cells_H1.fcs
    ## 6 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_Monocytes_H1.fcs
    ## 7 / 14  reading in file  2011-08-19-AML08-f.A_cct_subtracted_normalized_Pre B cells_H1.fcs
    ## 8 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_CD4 T cells_H2.fcs
    ## 9 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_CD8 T cells_H2.fcs
    ## 10 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_CD16- NK cells_H2.fcs
    ## 11 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_CD16+ NK cells_H2.fcs
    ## 12 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_Mature B cells_H2.fcs
    ## 13 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_Monocytes_H2.fcs
    ## 14 / 14  reading in file  2011-08-25-AML09-a.A_cct_subtracted_normalized_Pre B cells_H2.fcs

    # tidy data
    VAR %<>% rename(channel = name, marker = desc)
    for(i in names(map.channel)){
     VAR %<>% mutate("{i}" := unlist(map.channel[[i]][channel]))
    }

    # create AnnData object
    ad = AnnData(
      X = X,
      var = VAR,
      obs = OBS,
      uns = list(
        fcs_header = fcs.header
      )
    )

    # save AnnData object
    path_output = path.expand(paste0(PATH, 'ad_annotated_cell_types.h5ad'))
    anndata::write_h5ad(ad, path_output)

## Load AnnData object in python

We now created the AnnData object and can use it in our python project

    import anndata
    from pathlib import Path

    fpath = Path(r.path_output)
    assert fpath.is_file()
    ad = anndata.read_h5ad(fpath)
    print(ad)

    ## AnnData object with n_obs × n_vars = 96381 × 39
    ##     obs: 'fcs_file', 'celltype', 'complexcelltype', 'patient', 'ncells'
    ##     var: 'channel', 'marker', 'usedformanualannotation', 'usedforPhenoGraphclustering'
    ##     uns: 'fcs_header'