library(future)
library(Signac)
library(Seurat)
library(dplyr)
library(BiocParallel)
library(argparse)
# Command-line interface: one option per pipeline input. All values arrive
# as strings and are unpacked from `args` further down.
parser <- ArgumentParser()
parser$add_argument(
  "--gex_matrix_dir",
  help = "seeksoultools. gex. step3/raw_feature_bc_matrix"
)
parser$add_argument(
  "--atac_matrix_dir",
  help = "seeksoultools. atac. step3/filter_peaks_bc_matrix"
)
parser$add_argument(
  "--fragpath",
  help = "seeksoultools. atac. step3/asample_fragments.tsv.gz"
)
parser$add_argument("--outdir", help = "outdir")
parser$add_argument("--core", help = "Parallel running cores")
parser$add_argument("--species", help = "human or mouse")
parser$add_argument(
  "--anno_rds",
  help = "Anno_EnsDb_Hsapiens_v86.rds or Anno_EnsDb_Mmusculus_v79.rds"
)
parser$add_argument("--memory", help = "Memory usage")
args <- parser$parse_args()

# ---- Validate the command line ----
# Every option is consumed further down, so report all missing ones up front
# instead of failing later with an unrelated error (e.g. "argument is of
# length zero" from the species comparison).
required_args <- c(
  "gex_matrix_dir", "atac_matrix_dir", "fragpath", "outdir",
  "core", "species", "anno_rds", "memory"
)
is_missing <- vapply(
  required_args,
  function(arg_name) is.null(args[[arg_name]]),
  logical(1)
)
if (any(is_missing)) {
  stop(
    "Missing required argument(s): ",
    paste0("--", required_args[is_missing], collapse = ", "),
    call. = FALSE
  )
}

# Attach the reference genome that matches the species; the BSgenome object
# is needed later by RegionStats().
species <- args$species
if (species == "human") {
  library(BSgenome.Hsapiens.UCSC.hg38)
} else if (species == "mouse") {
  library(BSgenome.Mmusculus.UCSC.mm10)
} else {
  stop("Not human or mouse, please enter: 'human' or 'mouse'")
}

# ---- Resolve paths ----
# The script changes into `outdir` below. Relative paths given on the command
# line must therefore be anchored to the launch directory first, otherwise
# they would be looked up inside `outdir` after setwd(). Absolute paths, "~"
# paths and URLs are passed through untouched; symlinks are deliberately not
# resolved so that sibling files (e.g. a fragment index) are still found.
launch_dir <- getwd()
to_absolute <- function(path) {
  is_absolute <- grepl("^(/|~|[A-Za-z]:[\\\\/])", path)
  is_remote <- grepl("^[A-Za-z][A-Za-z0-9+.-]*://", path)
  if (is_absolute || is_remote) path else file.path(launch_dir, path)
}

outdir <- to_absolute(args$outdir)
gex_matrix_dir <- to_absolute(args$gex_matrix_dir)
atac_matrix_dir <- to_absolute(args$atac_matrix_dir)
fragpath <- to_absolute(args$fragpath)
core <- args$core
anno_rds <- to_absolute(args$anno_rds)
memory <- args$memory

dir.create(outdir, showWarnings = FALSE, recursive = TRUE)
# Later steps write several result files with bare file names, relying on the
# working directory being `outdir`.
setwd(outdir)

# ---- Multi-threaded execution ----
# --core and --memory arrive as strings; reject anything that is not a
# positive whole number before handing it to the future framework.
n_workers <- suppressWarnings(as.integer(core))
memory_gb <- suppressWarnings(as.integer(memory))
if (is.na(n_workers) || n_workers < 1L) {
  stop("--core must be a positive integer, got: ", core, call. = FALSE)
}
if (is.na(memory_gb) || memory_gb < 1L) {
  stop("--memory must be a positive integer (GiB), got: ", memory,
       call. = FALSE)
}
# Upper bound, in bytes, for globals exported to future workers.
options(future.globals.maxSize = memory_gb * 1024 ^ 3)
plan("multicore", workers = n_workers)
# Print the active plan to the log.
plan()

# ---- Load the count matrices ----
# Each banner ends with a newline so the object summary printed right after
# it starts on its own line in the log.
gex_data <- Read10X(data.dir = gex_matrix_dir)
gexobj <- CreateSeuratObject(counts = gex_data, assay = "RNA")
cat("------------gex------------------------\n")
print(gexobj)
atac_data <- Read10X(data.dir = atac_matrix_dir)
atacobj <- CreateSeuratObject(counts = atac_data, assay = "ATAC")
cat("------------atac------------------------\n")
print(atacobj)

# Joint cells = ATAC barcodes that also exist in the RNA matrix. Intersecting
# explicitly keeps the ATAC barcode order and makes a barcode mismatch between
# the two inputs visible instead of failing later when the assay is attached.
jointcb <- intersect(colnames(atacobj), colnames(gexobj))
if (length(jointcb) == 0) {
  stop("No barcode is shared between the ATAC and the RNA matrices; ",
       "check that both inputs come from the same sample.", call. = FALSE)
}
if (length(jointcb) < ncol(atacobj)) {
  message(ncol(atacobj) - length(jointcb),
          " ATAC barcode(s) are absent from the RNA matrix and were dropped.")
}
obj <- subset(gexobj, cells = jointcb)
cat("------------joint------------------------\n")
print(obj)

# Gene annotation (GRanges stored as RDS). Peak names and the BSgenome use
# "chr"-prefixed sequence names, so add the prefix where it is missing; levels
# that already carry it are left alone to avoid producing "chrchr1".
annotation <- readRDS(anno_rds)
anno_levels <- seqlevels(annotation)
needs_prefix <- !startsWith(anno_levels, "chr")
anno_levels[needs_prefix] <- paste0("chr", anno_levels[needs_prefix])
seqlevels(annotation) <- anno_levels

# Attach the ATAC counts as a chromatin assay. The counts are restricted to
# exactly the cells held by `obj`, in the same order, because an assay can
# only be added when its cells match the object's cells.
obj[["ATAC"]] <- CreateChromatinAssay(
  counts = atac_data[, colnames(obj), drop = FALSE],
  sep = c(":", "-"),
  fragments = fragpath,
  annotation = annotation
)


# ---- Peak filtering ----
# Keep only peaks located on standard chromosomes.
DefaultAssay(obj) <- "ATAC"
features.keep <- as.character(seqnames(granges(obj))) %in% standardChromosomes(granges(obj))
# Subsetting by ATAC features yields an object whose ATAC assay is filtered;
# only that assay is copied back so the RNA assay of `obj` stays untouched.
obj.filter <- obj[features.keep, ]
obj[["ATAC"]] <- obj.filter[["ATAC"]]
# The working directory already is `outdir` (setwd() at startup), so a bare
# file name lands there. Prefixing `outdir` again would point to
# "<outdir>/<outdir>/..." whenever `outdir` was given as a relative path.
saveRDS(obj, file = "filter_peaks_bc_matrix.rds")


# ---- RNA: normalisation ----
DefaultAssay(obj) <- "RNA"
obj <- SCTransform(obj)
obj <- RunPCA(obj)

# ---- RNA: dimensionality reduction and clustering (first 30 PCs) ----
obj <- FindNeighbors(obj, dims = 1:30)
obj <- FindClusters(obj, resolution = 0.8)
obj <- RunTSNE(obj, dims = 1:30, check_duplicates = FALSE)
obj <- RunUMAP(obj, dims = 1:30)

# ---- RNA: export t-SNE coordinates plus per-cell metadata ----
tsne_loci <- cbind(
  as.data.frame(Embeddings(obj, reduction = 'tsne')),
  obj[[]]
)
write.table(
  tsne_loci,
  file = 'gex_tsne_umi.xls',
  sep = "\t", quote = FALSE,
  row.names = TRUE, col.names = TRUE
)

# ---- Differential expression (marker) table ----
# Read the feature table with read.delim(), as Seurat's Read10X() does:
# unlike read.table() defaults it does not treat "'" as a quote or "#" as a
# comment, so unusual gene symbols cannot merge or truncate rows.
features_df <- read.delim(
  file.path(gex_matrix_dir, 'features.tsv.gz'),
  header = FALSE,
  stringsAsFactors = FALSE
)
names(features_df)[1:2] <- c('Ensembl', 'gene')
# Rebuild the gene names the way the Seurat object carries them: Read10X()
# de-duplicates symbols with make.unique() and Seurat replaces "_" with "-"
# in feature names. Without this, a symbol shared by several Ensembl IDs
# duplicates marker rows in the join, and renamed genes get no Ensembl ID.
features_df$gene <- gsub("_", "-", make.unique(features_df$gene), fixed = TRUE)
# Guarantee one feature row per gene name so the join cannot multiply rows.
features_df <- features_df[!duplicated(features_df$gene), , drop = FALSE]

# Positive markers for every cluster, annotated with the Ensembl ID and
# with the two identifier columns moved to the front.
obj.markers <- FindAllMarkers(obj, min.pct = 0.1, logfc.threshold = 0.25, only.pos = TRUE) %>%
  left_join(features_df, by = 'gene') %>%
  relocate(Ensembl, gene)
# Markers without a matching feature row get an explicit placeholder.
obj.markers$Ensembl[is.na(obj.markers$Ensembl)] <- "na"
write.table(obj.markers, file = 'gex_FindAllMarkers.xls', row.names = FALSE, sep = "\t", quote = FALSE)


# ---- ATAC: QC metrics ----
# Switch to the ATAC assay and compute the nucleosome signal and the TSS
# enrichment score.
DefaultAssay(obj) <- "ATAC"
obj <- NucleosomeSignal(obj)
obj <- TSSEnrichment(obj)

# ---- ATAC: normalisation, dimensionality reduction and clustering ----
obj <- FindTopFeatures(obj, min.cutoff = 5)
obj <- RunTFIDF(obj)
obj <- RunSVD(obj)
# All downstream steps use LSI components 2:30; the first component is
# skipped.
obj <- RunUMAP(object = obj, reduction = 'lsi', dims = 2:30, reduction.name = "umapATAC", reduction.key = "umapATAC_")
# check_duplicates = FALSE matches the RNA t-SNE call: cells with identical
# embeddings must not abort the run.
obj <- RunTSNE(obj, reduction = 'lsi', dims = 2:30, reduction.name = "tsneATAC", reduction.key = "tsneATAC_", check_duplicates = FALSE)
obj <- FindNeighbors(object = obj, reduction = 'lsi', dims = 2:30)
obj <- FindClusters(object = obj, verbose = FALSE, algorithm = 3)

# ---- ATAC: export t-SNE coordinates plus per-cell metadata ----
tsne_loci <- as.data.frame(Embeddings(obj, reduction = 'tsneATAC'))
tsne_loci <- cbind(tsne_loci, obj[[]])
write.table(tsne_loci, file = 'atac_tsne_umi.xls',
            row.names = TRUE,
            col.names = TRUE,
            sep = "\t",
            quote = FALSE)

# ---- Weighted nearest neighbour (WNN) analysis ----
DefaultAssay(obj) <- "SCT"

# Joint neighbour graph from the RNA PCA (dims 1:30) and the ATAC LSI
# (dims 2:30) reductions.
obj <- FindMultiModalNeighbors(object = obj,
                               reduction.list = list("pca", "lsi"),
                               dims.list = list(1:30, 2:30),
                               modality.weight.name = "RNA.weight",
                               verbose = TRUE)

# UMAP computed on the weighted neighbour object, stored as "UMAP_WNN".
obj <- RunUMAP(object = obj,
               nn.name = "weighted.nn",
               reduction.name = "UMAP_WNN",
               assay = "RNA",
               verbose = TRUE)


# ---- Link peaks to genes ----
DefaultAssay(obj) <- "ATAC"
# `species` was restricted to "human" or "mouse" at startup, so anything that
# is not human is mouse here.
genome <- if (species == "human") {
  BSgenome.Hsapiens.UCSC.hg38
} else {
  BSgenome.Mmusculus.UCSC.mm10
}
# Per-peak sequence statistics, computed before LinkPeaks().
obj <- RegionStats(obj, genome = genome)
obj <- LinkPeaks(object = obj, peak.assay = "ATAC", expression.assay = "SCT")

# The working directory already is `outdir` (setwd() at startup), so a bare
# file name lands there. Prefixing `outdir` again would point to
# "<outdir>/<outdir>/..." whenever `outdir` was given as a relative path.
saveRDS(obj, file = "joint_peak_link_gene.rds")

# ---- Summarise and export the peak-gene links ----
# Total number of links
linked_peaks <- Links(obj)
total_links <- length(linked_peaks)
# Number of distinct genes that have at least one link
linked_genes <- unique(linked_peaks$gene)
total_linked_genes <- length(linked_genes)
# Number of distinct peaks that have at least one link
linked_peaks_names <- unique(linked_peaks$peak)
total_linked_peaks <- length(linked_peaks_names)
# Report the three counts in the log
cat("总链接数量:", total_links, "\n")
cat("链接到基因的数量:", total_linked_genes, "\n")
cat("链接到 peaks 的数量:", total_linked_peaks, "\n")

# The working directory already is `outdir` (setwd() at startup), so a bare
# file name lands there. Prefixing `outdir` again would point to
# "<outdir>/<outdir>/..." whenever `outdir` was given as a relative path.
write.table(linked_peaks, file = "linked_feature.xls", sep = "\t", quote = FALSE, row.names = FALSE)


