
import math

def survival_probability(hours, mtbf):
    # Returns the probability that a digital media unit with a given
    # MTBF will survive a given time period (in hours).  While MTBF is
    # not a particularly reliable figure, at least it's a starting
    # point.
    # http://www.t-cubed.com/faq_mtbf.htm
    # http://en.wikipedia.org/wiki/Failure_rate
    return math.exp(-float(hours) / mtbf)


def fact(n):
    """Compute a factorial.

    fact(n) = n! = n * (n - 1) * (n - 2) * ... * 1
    """
    res = 1
    while n > 1:
        res *= n
        n -=1
    return res


def p_stripe(p_unit, u, L):
    """Compute the probability of retaining a stripe of digital media.

    'p_unit' is the probability that any single media unit will
    survive.  'u' is the number of media units in the stripe,
    including protection stripes.  'L' is the number of media unit
    losses that still result in the data being retained.

    After the time period has elapsed, there are 2^u possible outcomes
    for the stripe, each outcome representing a specific permutation
    of survived and failed media units.  To compute the probability of
    retaining enough data to recover the whole set, this function
    computes the sum of the probability of all successful outcomes.

    Each possible outcome has a different probability based on the
    probabilities of the corresponding outcomes in each unit.  The
    probability that a single unit will survive is p_unit and the
    probability that it will fail is (1 - p_unit).  Assign 'x' to mean
    the number of devices that fail in each outcome.  In each outcome,
    (u - x) units survive.  The probability of each permutation of
    surviving and failed media units is the product of the probability
    of each device having a particular outcome.  p0(x) gives the
    probability of one permutation, where x is the number of failed
    media units.

      p0(x) = p_unit ^ (u - x)  *  (1 - p_unit) ^ x

    All outcomes with the same number of failed media units have the
    same probability.  To simplify the summation, compute the number
    of possible outcomes with (x) failures.  The number of possible
    outcomes with (x) failures can be expressed as the number of
    combinations of (u) devices taken (x) at a time, or:

      C(x) = u! / ((u - x)! * x!)

    Multiply C(x) by p0(x) to compute the probability of exactly (x)
    devices failing.  The probability of all tolerable outcomes is the
    sum of the probability of (0, 1, .. L) failed devices.  Here is
    the equation written in ASCII art, using 'E' to represent
    summation.  Use a monospace font to read it:

                  L     u!          (u - x)             x
      p_stripe =  E  -------- p_unit        (1 - p_unit)
                 x=0 (u-x)!x!

    """
    # total is the sum of the probability of all outcomes that retain the data.
    total = 0.0
    # x progresses from 0 to the number of tolerable losses.
    for x in range(0, L + 1):
        # p0 is the probability of one permutation.
        p0 = (p_unit ** (u - x)) * ((1 - p_unit) ** x)
        # C is the number of failure combinations whose probability match p0.
        C = fact(u) / (fact(u - x) * fact(x))
        total += C * p0
    return total


def main():
    mtbf = 1000000.0  # typical hard drive MTBF
    r = survival_probability(365.24 * 24, mtbf)
    print 'media unit survival probability after 1 year with MTBF=%d: %f' % (
        mtbf, r)
    r = survival_probability(30.5 * 24, mtbf)
    print 'media unit survival probability after 1 month with MTBF=%d: %f' % (
        mtbf, r)

    # Now set the reliability, regardless of published MTBF numbers.
    # This number should come directly from observation.  Just buy a
    # lot of media units, count how many survive 1 month (or whatever
    # refresh period you choose), and divide that number by the number
    # of media units you bought.  You'll get a fraction between 0 and
    # 1.  Repeat the measurement for at least three months (since
    # three months is a typical burn-in period.)  Choose the lowest of
    # all measurements.
    r = 0.98  # faked for now
    print ('survival probability after 1 month, '
           'based on observation (faked for now):'), r
    print 'unprotected volume (can lose 0):', p_stripe(r, 1, 0)
    print 'RAID 0 with 2 units (can lose 0):', p_stripe(r, 2, 0)
    print 'RAID 0 with 4 units (can lose 0):', p_stripe(r, 4, 0)
    print 'RAID 1 with 2 units (can lose 1):', p_stripe(r, 2, 1)
    print 'RAID 1 with 3 units (can lose 2):', p_stripe(r, 3, 2)
    print 'RAID 5 with 8 units (can lose 1):', p_stripe(r, 8, 1)
    print 'RAID 5 with 4 units (can lose 1):', p_stripe(r, 4, 1)
    print 'RAID 6 with 8 units (can lose 2):', p_stripe(r, 8, 2)
    print 'RAID 6 with 4 units (can lose 2):', p_stripe(r, 4, 2)
    # Use more extensive forward error correction to further improve
    # data safety.
    print 'FEC, 4 data, 3 protection (can lose 3):', p_stripe(r, 7, 3)
    print 'FEC, 4 data, 4 protection (can lose 4):', p_stripe(r, 8, 4)
    print 'FEC, 12 data, 3 protection (can lose 3):', p_stripe(r, 15, 3)
    print 'FEC, 20 data, 10 protection (can lose 10):', p_stripe(r, 30, 10)
    print 'FEC, 20 data, 20 protection (can lose 20):', p_stripe(r, 40, 20)

if __name__ == '__main__':
    main()

