yang lu
unread,May 11, 2025, 5:13:30 AMMay 11Sign in to reply to author
Sign in to forward
You do not have permission to delete messages in this group
Either email addresses are anonymous for this group or you need the view member email addresses permission to view the original message
to slim-discuss
Extracting ancestral tracts from admixed population in two population admixture model by slim and tskit
Hello everyone! I'm encountering an issue with ancestral tract length analysis from tree sequences simulated using SLiM. While both my SLiM model and Python analysis pipeline run without errors, the extracted average ancestral tract lengths for both populations show significant discrepancies from theoretical expectations. For instance, under an admixture event occurring G = 200 generations ago with proportions m₁ = 0.4 and m₂ = 0.6, the expected average ancestral tract lengths should follow L = 1 / [G × (1 − mᵢ)]. However, the values simulated from my Python pipeline deviate significantly from this theoretical expectation.
And my code is written below:
SLiM:
initialize() {
setSeed(SEED);
initializeTreeSeq();
initializeMutationRate(MU);
initializeMutationType("m1", 0.5, "f", 0.0);
initializeMutationType("m2", 0.5, "f", 2*SEL1);
initializeGenomicElementType("g1", m1, 1.0);
initializeGenomicElement(g1, 0, asInteger(L));
initializeRecombinationRate(R);
//defineConstant("G",GEN);
defineConstant("tree_seq_path",TREE_TMP_PATH+"tmp_tree_"+SEED+".trees");
defineConstant("vcf_path_ref",VCF_PATH+"tmp_vcf_ref_"+SEED+".vcf");
defineConstant("vcf_path_adm",VCF_PATH+"tmp_vcf_adm_"+SEED+".vcf");
//defineConstant("Burn-start",10*Ne+1)
}
1 early() {
cat("start simulation\n");
sim.addSubpop("p1", Ne);
sim.addSubpop("p2", Ne);
}
1: late(){
catn(sim.cycle);
}
10*Ne late(){
sim.treeSeqOutput(tree_seq_path);
target=sample(p1.genomes, 1);
target.addNewDrawnMutation(m2, asInteger(L/2));
}
10*Ne: late(){
muts = sim.mutationsOfType(m2);
counts = p1.genomes.countOfMutationsOfType(m2);
freq = mean(counts > 0);
if (size(muts) !=1)
{
cat("RESTART "+sim.cycle+"\n");
catn("-------------");
sim.readFromPopulationFile(tree_seq_path);
setSeed(getSeed() + 1);
target = sample(p1.genomes, 1);
target.addNewDrawnMutation(m2, asInteger(L/2));}
else{
if (freq>FREQ_LB)
{ times=sim.cycle+1;
community.rescheduleScriptBlock(s1, start=times, end=times);
community.rescheduleScriptBlock(s2, start=times, end=times);
community.rescheduleScriptBlock(s3, start=times+GEN, end=times+GEN);
//community.rescheduleScriptBlock(s4, start=times+GEN, end=times+GEN);
community.deregisterScriptBlock(self);
}
}
}
s1 10000000 early(){
sim.addSubpop("p3", Ne);
catn("Start admixture at "+sim.cycle);
p3.setMigrationRates(c(p1, p2),c(MIG_PROP, 1-MIG_PROP));
}
s2 100000000 late(){
m2muts = sim.mutationsOfType(m2);
m2muts.setSelectionCoeff(2*SEL2);
p3.setMigrationRates(c(p1, p2),c(0,0));
}
s3 1000000000 late(){
catn(tree_seq_path);
c(p1.genomes[0:(NUMSIMU-1)],p2.genomes[0:(NUMSIMU-1)]).outputVCF(vcf_path_ref,outputMultiallelics=F);
//c(p3.genomes[0:NUMSIMU]).outputVCF(vcf_path_adm,outputMultiallelics=F);
p1.setSubpopulationSize(0);
p2.setSubpopulationSize(0);
//p3.setSubpopulationSize(NUMSIMU);
sim.treeSeqOutput(tree_seq_path);
//sim.simulationFinished();
cat("end simulation\n");
}
I set a very weak selection to approximate the neutral condition
Python:
def find_anc_node(tree,node_id):
## find the "root" node (may not fully coelescent when the sample are from different ancestry)
current_node = node_id
while current_node != tskit.NULL:
parent = tree.parent(current_node)
if parent == -1:
break
current_node = parent
return current_node
seeder=123
rec_rate=1e-8
Ne=500
num_simu=100
num_simu_ref=int(2*num_simu) #the sample number of reference hap
num_simu_adm=num_simu #the sample number of reference diploids
freq_lb=0.995
mig_prop=0.4
mu=1e-8
sel1=0.01
sel2=0.0000000001 # very weak selection nearly neutral
L=1e8
Gen=200
vcf_save_path_prefix="/data/yanglu/0503compare_methods/vcfpath/"
tree_tmp_path_prefix="/data/yanglu/0503compare_methods/treetmp/"
slim_path="/home/luyang/0503compare_methods/slim_2pop.slim"
##SLiM simulation
slim_commands=f"slim -d L={int(L)} -d SEED={seeder} -d NUMSIMU={num_simu_ref} -d FREQ_LB={freq_lb} -d MIG_PROP={mig_prop} -d MU={mu} -d SEL1={sel1} -d SEL2={sel2} -d Ne={Ne} -d R={rec_rate} -d GEN={Gen} -d \"VCF_PATH='{vcf_save_path_prefix}'\" -d \"TREE_TMP_PATH='{tree_tmp_path_prefix}'\" {slim_path}"
# subprocess.run(slim_commands, shell=True, capture_output=True,text=True)
subprocess.run(slim_commands, shell=True)
## Extract the subtree of admixed population of sample
tree_total_path=f"/data/yanglu/0503compare_methods/treetmp/tmp_tree_{seeder}.trees"
ts = tskit.load(tree_total_path)
adm_vcf_path=f"/data/yanglu/0503compare_methods/vcfpath/tmp_vcf_adm_{seeder}.vcf"
sampled_indivs = random.sample(range(ts.num_individuals), num_simu_adm)
sampled_nodes = []
for ind in sampled_indivs:
sampled_nodes.extend(ts.individual(ind).nodes)
# print(sampled_indivs) #[5, 16, 12, 43]
# print(sampled_nodes) #[10, 11, 32, 33, 24, 25, 86, 87]
ancts=ts.simplify(sampled_nodes,filter_populations=False,keep_input_roots=True)
## The treeinfo table records the ancestral origins of different genomic tracts along admixed population chromosomes."
tree_res=[]
for node in ancts.samples():
for tree in ancts.trees():
interval=[tree.interval.left,tree.interval.right]
anc_node=find_anc_node(tree,node)
anc=tree.population(anc_node)
# print(tree.index,node,anc,tree.interval.left,tree.interval.right)
tree_res.append([
tree.index,
node,
anc,
tree.interval.left,
tree.interval.right
])
treeinfo = pd.DataFrame(tree_res, columns=["tree_id", "sam_node", "anc", "inter_left", "inter_right"])
treeinfo["inter_left"]=treeinfo["inter_left"]*rec_rate
treeinfo["inter_right"]=treeinfo["inter_right"]*rec_rate
treeinfo["length"]=treeinfo["inter_right"]-treeinfo["inter_left"]
treeinfo.to_csv("/home/luyang/0503compare_methods/treeinfo.csv",index=False,sep="\t")
treeinfo.drop(["length"], axis=1, inplace=True)
## Merge the adajacent interval with same ancestry
merge_anc=pd.DataFrame([])
for hap in treeinfo["sam_node"].unique():
df=treeinfo.loc[treeinfo["sam_node"]==hap,].reset_index(drop=True)
df['group'] = (df['anc'] != df['anc'].shift()).cumsum()
result = df.groupby(['anc', 'group']).agg({'sam_node': 'first','inter_left': 'first','inter_right': 'last',"anc":"first"}).reset_index(drop=True)
merge_anc=pd.concat([merge_anc,result],axis=0)
merge_anc=merge_anc.reset_index(drop=True)
# merge_anc = pd.DataFrame(merge_anc, columns=["hap", "inter_left", "inter_right", "target_anc"])
merge_anc["length"]=merge_anc["inter_right"]-merge_anc["inter_left"]
##calculate the average length ancestral tracts
print(merge_anc)
print(merge_anc.groupby(["anc"]).mean())## quite different from the theoratical results
Could there be any issues with my approach to extracting ancestral tracts? Thanks in advance