Confidence interval for sample size for urn problem (sampling without replacement)

by Simon Woodward   Last Updated June 14, 2018 22:19 PM

I have a finite population (H) of animals with a known number of GPS collars (C).

If I take a random sample of (M) animals, the probability of a given number (X) of GPS collars being in the sample is calculated using a hypergeometric distribution. This is the classic "urn problem".

However, what if the number of animals sampled from the herd is unknown, and I can only see the number of GPS collars in the sample (X). How can I derive a 95% confidence interval for the number of animals in the sample (M)?



Answers 1


I figured it out using the hypergeometric distribution and Bayes equation.

library(tidyverse)
library(ggthemes)
library(gtools)

# example
n_cow_in_herd <- 300
n_cow_in_mob <- 30

# part 1 - how many collars do we need to ensure N in a selected mob?

# https://en.wikipedia.org/wiki/Urn_problem
# binomial = draw with replacement
# hypergeometric = draw without replacement
max_need_in_mob <- 6L 
max_collar_in_herd <- 100L
data <- vector("list", max_need_in_mob)
for (i in 1L:max_need_in_mob){
  data_i <- tibble(n_collar_in_herd=1L:max_collar_in_herd,
                   i_need_in_mob=i) %>%
    mutate(
      # pbinom(i,n,p) = prob of getting i or less in n trials each with p prob of success
      b_prob_i_or_more = 1-pbinom(i-1, n_cow_in_mob, n_collar_in_herd/n_cow_in_herd), 
      # phyper(i,a,b,n) = prob of getting i or less good eggs in n draws from basket with 'a' good and 'b' bad eggs
      h_prob_i_or_more = 1-phyper(i-1, n_collar_in_herd, n_cow_in_herd-n_collar_in_herd, n_cow_in_mob) 
    )
  data[[i]] <- data_i
}
data <- bind_rows(data) %>%
  arrange(i_need_in_mob, n_collar_in_herd) %>%
  mutate(i_need_in_mob=as.factor(i_need_in_mob))

# cumulative plot
redmono = c("#99000D", "#CB181D", "#EF3B2C", "#FB6A4A", "#FC9272", "#FCBBA1", "#FEE0D2", "#FFF5F0")
title <- paste("Probability of at least X collars in mob (", n_cow_in_herd, "cows in herd, ", n_cow_in_mob, " cows in mob).")
breaks <- c(0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1)
p1 <- ggplot() +
  theme_solarized() +
  labs(title=title, x="C collars in herd", y="") +
  guides(colour=guide_legend(title="X")) +
  geom_hline(yintercept=c(0.95), colour="darkgrey") +
  scale_y_continuous(breaks=breaks) +
  # geom_path(data=data, mapping=aes(x=n_collar_in_herd, y=b_prob_i_or_more, colour=i_need_in_mob), linetype=3) +
  geom_path(data=data, mapping=aes(x=n_collar_in_herd, y=h_prob_i_or_more, colour=i_need_in_mob)) 

print(p1)

# part 2 - confidence interval on how many cows in mob from number of collars?

# https://en.wikipedia.org/wiki/Binomial_proportion_confidence_interval (approximation poor at tails)
# Bayes: p(A|B) = p(B|A)*p(A)/p(B)
# p(sample size N|i collars) = p(i collars|sample size N) * p(sample size N prior) / p(i collars evidence)

n_collar_in_herd <- 150L
max_seen_in_mob <- 6L 
max_cow_in_mob <- 75L
prior_mean <- 30
prior_sd <- 10
data <- vector("list", max_need_in_mob)
for (i in 1L:max_seen_in_mob){
  data_i <- tibble(n_cow_in_mob=1L:max_cow_in_mob,
                   i_seen_in_mob=i) %>%
    mutate(
      prior_i = dnorm(n_cow_in_mob, prior_mean, prior_sd),
      prior_i_or_less = pnorm(n_cow_in_mob, 30, 10),
      # pbinom(i,n,p) = prob of getting i or less in n trials each with p prob of success
      b_prob_i = dbinom(i, n_cow_in_mob, n_collar_in_herd/n_cow_in_herd), 
      b_prob_i_or_less = pbinom(i, n_cow_in_mob, n_collar_in_herd/n_cow_in_herd), 
      # phyper(i,a,b,n) = prob of getting i or less good eggs in n draws from basket with 'a' good and 'b' bad eggs
      h_prob_i = dhyper(i, n_collar_in_herd, n_cow_in_herd-n_collar_in_herd, n_cow_in_mob),
      h_prob_i_or_less = 1-phyper(i-1, n_collar_in_herd, n_cow_in_herd-n_collar_in_herd, n_cow_in_mob),
      h_prob_i_adj = h_prob_i * prior_i,
      h_prob_i_or_less_adj = cumsum(h_prob_i_adj),
      h_prob_i_or_less_adj = h_prob_i_or_less_adj / max(h_prob_i_or_less_adj)
    )
  data[[i]] <- data_i
}
data <- bind_rows(data) %>%
  arrange(i_seen_in_mob, n_cow_in_mob) %>%
  mutate(i_seen_in_mob=as.factor(i_seen_in_mob))

# density plot
redmono = c("#99000D", "#CB181D", "#EF3B2C", "#FB6A4A", "#FC9272", "#FCBBA1", "#FEE0D2", "#FFF5F0")
title <- paste("Probability of \u2264 M cows in mob given X collars in mob (", n_cow_in_herd, "cows in herd, ", n_collar_in_herd, " collars in herd).")
subtitle <- paste("Dotted is without prior. Solid prior is Normal(", prior_mean, ",", prior_sd, ").")
breaks <- c(0, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1)
p1 <- ggplot() +
  theme_solarized() +
  labs(title=title, subtitle=subtitle, x="M cows in mob", y="") +
  guides(colour=guide_legend(title="X")) +
  scale_y_continuous(breaks=breaks) +
  geom_hline(yintercept=c(0.05,0.95), colour="darkgrey") +
  geom_path(data=filter(data, i_seen_in_mob==1L), mapping=aes(x=n_cow_in_mob, y=prior_i_or_less), colour="darkgrey", size=1.5, linetype=2) +
  geom_path(data=data, mapping=aes(x=n_cow_in_mob, y=h_prob_i_or_less, colour=i_seen_in_mob), linetype=3) +
  geom_path(data=data, mapping=aes(x=n_cow_in_mob, y=h_prob_i_or_less_adj, colour=i_seen_in_mob)) 

print(p1)

enter image description here

enter image description here

Simon Woodward
Simon Woodward
June 14, 2018 21:45 PM

Related Questions


Confidence interval for a hypergeometric distribution

Updated November 06, 2017 20:19 PM

Sample mean vs sampling distribution mean

Updated June 08, 2017 21:19 PM

Margin of Error of Sample Variance

Updated August 05, 2015 18:08 PM