Two random variables being marginally Gaussian does not imply they are jointly Gaussian. Joint Gaussianity of a random vector \(\mathbf{X}\in\mathbb{R}^d\) means any linear combination of the components is Gaussian. The following is a counterexample: Let \(X, Z\sim \mathcal{N}(0,1)\). Define: \(Y = \begin{cases} Z, XZ\ge 0 \\ -Z, XZ\lt 0 \end{cases}\)
# Monte Carlo demo: X and Y are each marginally N(0,1), but (X, Y) is not
# jointly Gaussian -- the scatter plot puts mass only where X and Y share sign.
nmc = 10000
x = np.random.randn(nmc)
z = np.random.randn(nmc)
# Y = Z when X*Z >= 0, and Y = -Z otherwise (matches the definition above;
# the original used a strict inequality and a redundant plt.figure(1) that
# left behind an empty extra figure).
same_sign = x * z >= 0
y = np.where(same_sign, z, -z)
fig, ax = plt.subplots(1, 3, figsize=(20, 4))
nbins = 100
ax[0].hist(x, nbins, color="red"); ax[0].set_title("X")
ax[1].hist(y, nbins, color="blue"); ax[1].set_title("Y")
ax[2].scatter(x, y, color="purple", s=2.0); ax[2].set_title("Joint")
ax[2].set_xlabel("X"); ax[2].set_ylabel("Y")
The plot confirms that they are not jointly Gaussian despite being marginally Gaussian.
Gambler’s Ruin describes the inevitable loss a gambler faces when playing a game with fixed odds and finite resources. This concept is usually related to using a tree approach for pricing options under different scenarios of underlying.
In the code below, I play with data structures, mainly heaps and deques, to achieve \(O(1)\) queries of statistics from a data stream: max (or min), average, mode (most frequent item, with ties broken by the most recently processed point), and median. The maximum, mode and median seem to be non-trivial problems to implement efficiently, especially when one only intends to track the \(O(k)\) most recent data.
from collections import defaultdict, deque
import heapq
class DataStream:
    """Sliding-window statistics over the k most recent stream elements.

    Maintains the max, average, mode, and median of the window, each
    queryable in O(1) (amortized for the underlying helpers).
    """

    def __init__(self, k):
        """
        :param k: size of the window for the sliding-window statistics.
        """
        self.k = k                                     # window capacity
        self.data = deque()                            # the k most recent points
        self.sum = 0                                   # running sum for the average
        self.max_data = deque()                        # monotonic (non-increasing) deque of maxima
        self.mode_tracker = SlidingWindowModeDeque(k)  # helper tracking the mode
        self.median_tracker = SlidingWindowMedian(k)   # helper tracking the median

    def push(self, x):
        """Insert a new element and refresh every window statistic.

        :param x: new element from the stream.
        """
        if len(self.data) == self.k:
            evicted = self.data.popleft()
            self.sum -= evicted
            # The monotonic deque only needs attention when the evicted
            # element is the current maximum at its front.
            if self.max_data[0] == evicted:
                self.max_data.popleft()
        self.data.append(x)
        self.sum += x
        # Anything smaller than x can never be the window max again.
        while self.max_data and self.max_data[-1] < x:
            self.max_data.pop()
        self.max_data.append(x)
        self.mode_tracker.add_number(x)
        self.median_tracker.push(x)

    def get_average(self):
        """Average of the window, rounded to 2 decimal places."""
        return round(self.sum / len(self.data), 2)

    def get_max(self):
        """Maximum of the window, rounded to 2 decimal places."""
        return round(self.max_data[0], 2)

    def get_mode(self):
        """Most frequent element in the window (ties go to the most recent)."""
        return self.mode_tracker.get_mode()

    def get_median(self):
        """Median of the window, rounded to 2 decimal places."""
        return round(self.median_tracker.get_median(), 2)
class SlidingWindowModeDeque:
    """Tracks the mode of the k most recent elements.

    Bug fixed: the original monotonic-deque heuristic was incorrect once
    elements were evicted. Example (k=4): after pushing a,a,a,b and then c,
    evicting a single `a` popped the whole front entry and get_mode()
    reported `c`, although `a` still occurs twice in the window. The mode is
    now computed from exact window counts plus a recency stamp; get_mode()
    costs O(d) in the number of distinct values currently in the window.
    """

    def __init__(self, k):
        self.k = k                    # size of the sliding window
        self.window = deque()         # elements currently in the window
        self.freq = defaultdict(int)  # exact frequency of each window value
        self.last_seen = {}           # value -> processing index of its latest occurrence
        self._ticks = 0               # total number of elements processed so far

    def add_number(self, num):
        """Add num to the window, evicting the oldest element when full."""
        if len(self.window) == self.k:
            old = self.window.popleft()
            self.freq[old] -= 1
            if self.freq[old] == 0:
                # Drop bookkeeping for values that left the window entirely.
                del self.freq[old]
                self.last_seen.pop(old, None)
        self.window.append(num)
        self.freq[num] += 1
        self.last_seen[num] = self._ticks
        self._ticks += 1

    def get_mode(self):
        """Mode of the window; ties broken by the most recently processed value."""
        return max(self.freq, key=lambda v: (self.freq[v], self.last_seen[v]))
class SlidingWindowMedian:
    """Median of the k most recent elements via two heaps with lazy deletion.

    Bug fixed: the original pop() removed an element chosen from the heap
    tops (effectively always the lower-half maximum), not the OLDEST element
    of the window, so the reported median drifted away from the true sliding
    median; it could also raise IndexError when min_heap was empty. Arrival
    order is now recorded in a deque so evictions target the right value.
    """

    def __init__(self, k):
        """
        :param k: size of the sliding window.
        """
        self.k = k
        self.window = deque()              # arrival order, for correct eviction
        self.max_heap = []                 # lower half, values stored negated
        self.min_heap = []                 # upper half
        self.to_remove = defaultdict(int)  # value -> pending lazy deletions
        self.lo_size = 0                   # count of live elements in max_heap
        self.hi_size = 0                   # count of live elements in min_heap

    def _prune_lo(self):
        """Discard lazily-deleted values sitting on top of the lower heap."""
        while self.max_heap and self.to_remove[-self.max_heap[0]] > 0:
            self.to_remove[-self.max_heap[0]] -= 1
            heapq.heappop(self.max_heap)

    def _prune_hi(self):
        """Discard lazily-deleted values sitting on top of the upper heap."""
        while self.min_heap and self.to_remove[self.min_heap[0]] > 0:
            self.to_remove[self.min_heap[0]] -= 1
            heapq.heappop(self.min_heap)

    def _balance_heaps(self):
        """Restore the invariant hi_size <= lo_size <= hi_size + 1."""
        if self.lo_size > self.hi_size + 1:
            self._prune_lo()
            heapq.heappush(self.min_heap, -heapq.heappop(self.max_heap))
            self.lo_size -= 1
            self.hi_size += 1
        elif self.hi_size > self.lo_size:
            self._prune_hi()
            heapq.heappush(self.max_heap, -heapq.heappop(self.min_heap))
            self.hi_size -= 1
            self.lo_size += 1
        self._prune_lo()
        self._prune_hi()

    def push(self, num):
        """Add a new number; evicts the oldest one once the window is full."""
        self.window.append(num)
        if len(self.window) > self.k:
            self.pop()
        self._prune_lo()
        if not self.max_heap or num <= -self.max_heap[0]:
            heapq.heappush(self.max_heap, -num)
            self.lo_size += 1
        else:
            heapq.heappush(self.min_heap, num)
            self.hi_size += 1
        self._balance_heaps()

    def pop(self):
        """Remove the oldest element of the window (lazily, from the heaps)."""
        oldest = self.window.popleft()
        self._prune_lo()
        # Classify the evicted value BEFORE registering the deletion, so the
        # comparison uses a live lower-heap top.
        if self.max_heap and oldest <= -self.max_heap[0]:
            self.lo_size -= 1
        else:
            self.hi_size -= 1
        self.to_remove[oldest] += 1
        self._prune_lo()
        self._prune_hi()
        self._balance_heaps()

    def get_median(self):
        """Current median (average of the two middle values for even counts)."""
        self._prune_lo()
        self._prune_hi()
        if self.lo_size > self.hi_size:
            return -self.max_heap[0]
        return (-self.max_heap[0] + self.min_heap[0]) / 2
# Example usage and testing of the sliding-window statistics tracker.
data_stream = [1, 2, 3, 3, 2, -1, 2]
k = 3
# Use the k variable instead of re-hardcoding the window size.
track = DataStream(k=k)
for val in data_stream:
    track.push(val)
    print(f"Window: {list(track.data)}, Average: {track.get_average()}, Max: {track.get_max()}, Mode: {track.get_mode()}, Median: {track.get_median()}")
There is a silly problem on stackexchange of connecting shoelaces (some call it spaghetti) into loops and counting how many loops you have, by randomly picking ends of the shoelaces (if you picked two different shoelaces, they merge into one). The brainteaser is asking for the expected number of loops (when you uniformly sample the shoelaces’ ends). The problem can be solved analytically and also simulated. The following is the code.
def theo(n):
    """Expected number of loops from n noodles: sum_{i=1}^{n} 1/(2i - 1)."""
    return sum(1 / (2 * i - 1) for i in range(1, n + 1))
print(theo(100))
import numpy as np
def simulate_outer(n):
    """Simulate the number of loops formed from n noodles.

    At each stage with m noodles left, the chance that the chosen pair of
    ends closes a loop is 1/(2m - 1); otherwise two noodles merge. The last
    noodle always closes into a loop.

    Fixes: the original carried a `memo` dict that was never written to
    (dead code -- memoizing a stochastic function would also be wrong), and
    used recursion, which hits the recursion limit for large n. This version
    iterates instead.

    :param n: initial number of noodles (n >= 1).
    :return: total number of loops formed (between 1 and n).
    """
    loops = 1  # the final remaining noodle always forms a loop
    for m in range(n, 1, -1):
        # Bernoulli(1/(2m-1)) event: the two chosen ends belong to one noodle.
        if np.random.uniform() <= 1 / (2 * m - 1):
            loops += 1
    return loops
n = 100
nmc = 10000
# Monte Carlo estimate of the expected number of loops for n noodles
# (uses the n variable instead of the hardcoded 100).
print(sum(simulate_outer(n) for _ in range(nmc)) / nmc)
import numpy as np
def simulate_noodles(n):
    """Simulate the number of loops formed by randomly connecting n noodles.

    Each noodle has two ends; at every step two uniformly chosen free ends
    are tied together. Tying the two ends of the same noodle closes a loop;
    otherwise the two noodles merge into one longer noodle.

    Bug fixed: when merging, the original only updated `ends[other_end1]`,
    leaving `ends[other_end2]` pointing at an end that was already popped
    and keeping a stale noodle id, so later same-noodle checks were wrong.

    :param n: initial number of noodles.
    :return: total number of loops formed.
    """
    # end id -> (the other end of the same noodle, noodle id)
    ends = {}
    for i in range(n):
        ends[2 * i] = (2 * i + 1, i)
        ends[2 * i + 1] = (2 * i, i)
    loops = 0
    k = n
    while k > 0:
        left, right = np.random.choice(list(ends.keys()), 2, replace=False)
        if ends[left][1] == ends[right][1]:
            # Both chosen ends belong to the same noodle: a loop closes.
            loops = loops + 1
            ends.pop(left)
            ends.pop(right)
        else:
            # Different noodles: merge into one. Both surviving ends must
            # point at each other and share a single noodle id.
            other_end1, noodle1 = ends.pop(left)
            other_end2, noodle2 = ends.pop(right)
            ends[other_end1] = (other_end2, noodle1)
            ends[other_end2] = (other_end1, noodle1)
        # Either branch reduces the noodle count by exactly one.
        k = k - 1
    return loops
n = 100
nmc = 10000
# Monte Carlo estimate using the end-matching simulation. The original
# called simulate_outer here (copy-paste slip) and hardcoded 100; this
# section is meant to exercise simulate_noodles with the n variable.
print(sum(simulate_noodles(n) for _ in range(nmc)) / nmc)
They should all give something close to 3.284.
The following are equivalent:
The chain is recurrent
For every \(x, y\in A\) where \(A\) is a countable set (state space), we have \(\mathbb{E}^x[V_y] = \infty\), where \(V_x = \sum_{n=0}^{\infty}\mathbf{1}\{X_n=x\}\).
There exists \(x, y\in A\) such that \(\mathbb{E}^x[V_y] = \infty\)
For every \(x,y\in A\), \(\mathbb{P}^x\{V_y = \infty\} = 1\)
There exists \(x,y \in A\) such that \(\mathbb{P}^x\{V_y = \infty\} = 1\).
Let a Markov chain be described as a sequence of random variables \(X_0,X_1,X_2,\cdots\) taking values in an at most countable set \({A}\), that satisfies Markov property and time-homogeneity.
Let the Markov chain be recurrent, let \(x,y\in A\) be arbitrary. Then:
\[ \mathbb{E}^x[V_y] = \mathbb{E}^x\bigg[\sum_{i=0}^{\infty}\mathbf{1}(X_n = y)\bigg] = \sum_{n=0}^{\infty}p_n(x,y) \] since the chain is recurrent, let \(\tau_y = \min\{k\ge 1: X_k=y\}\) and we have \(\tau_y = C < \infty\) for some constant \(C\) almost surely. This also implies \(p_C(x,y) = 1, p_{2C}(x,y) = 1, \cdots, p_{kC}(x,y) = 1,\cdots\), therefore we have \(\mathbb{E}^x[V_y] = \infty\) (due to time-homogeneity).
Statement 3, 5 follow directly from statements 2, 4.
Now we show that statement 2 implies statement 4. We have that there exists \(C<\infty\) such that \(1= p_C(x,y) = \cdots = p_{kC}(x,y) = \cdots\). This means \(1=\mathbf{1}\{X_{C}=y\} = \cdots = \mathbf{1}\{X_{kC}=y\}\) almost surely, thus almost surely we have \(V_y = \infty\), or \(\mathbb{P}^x\{V_y=\infty\} = 1\).
Finally it is enough to demonstrate statement 3 implies statement 2. Since the chain is irreducible, there exists \(n_x\) such that \(p_{n_x}(x,y)>0\), and there exists \(n_y\) such that \(p_{n_y}(y,x)>0\). We consider \(p(y,x)\) (probability of starting from \(y\) and reaching \(x\)). By total probability, we find: \begin{equation} p_{n_y+kC+n_x}(y,x) \ge p_{n_y}(y,x)p_{kC}(x,y)p_{n_x}(y,x) \end{equation} namely one way of reaching \(x\) in \(n_y+kC+n_x\) steps from \(y\) is to go to \(x\), go back to \(y\), and then go to \(x\). Summing over \(k\) on both sides, we have: \begin{equation} \sum_{k=1}^{\infty}p_{n_y+kC+n_x}(y,x) \ge p_{n_y}\bigg(\sum_{k=1}^{\infty}p_{kC}(x,y)\bigg)p_{n_x}(y,x) = \infty \end{equation} this implies \(\mathbf{1}_{\{n_y +kC + n_x\}} = 1\) must occur infinitely often. Thus we conclude \(x\) gets visited infinitely many times starting from \(y\), given \(y\) gets visited infinitely many times starting from \(x\). With the same argument applied to any node on the path, we conclude that in an irreducible Markov chain, every node is recurrent. ▪
\begin{equation} p(x, x+1) = q, p(x, x-1) = 1-q \end{equation} with \(q\in (0,1)\). Let \(p_n = p_n(0,0) = \mathbb{P}^0\{X_n = 0\}\). We show the following properties:
If \(n\) is odd, \(p_n=0\)
Assuming Stirling’s formula: \(\lim_{n\rightarrow\infty}\frac{n!}{n^{n+1/2}e^{-n}} = \sqrt{2\pi}\) we have if \(q=1/2\), then: \(\lim_{n\rightarrow\infty}n^{1/2}p_{2n} = 1/\sqrt{\pi}\)
Each step of transition the walker can only go up or down by 1. Starting from the origin, to get back to the origin it must go up exactly \(n/2\) times and down \(n/2\) times along the way. And this is not achievable by odd \(n\).
As in (1), in \(2n\) steps, we must have \(n\) steps going up, and \(n\) steps going down. Each step is a Bernoulli-\(q\) trial leading to \(+1\) or \(-1\). Thus the formula follows from binomial distribution.
We begin with: \begin{equation} p_{2n} = {2n \choose n}\frac{1}{2^{2n}} = \frac{(2n)!}{(n!)^2}\cdot\frac{1}{2^{2n}} \end{equation} we try to put the expression above into the form given by Stirling. \begin{equation} n^{1/2}p_{2n} = \frac{(2n)!}{(n!)(n!)}\cdot \frac{n^{1/2}}{2^{2n}} = \frac{(2n)!}{(n!)(n!)}\cdot \frac{(n^{n+1/2}e^{-n})^2}{(2n)^{2n+1/2}e^{-2n}}\cdot 2^{-2n}\cdot C_n\cdot\sqrt{n} \end{equation} where \(C_n\) is the reciprocal of the middle term. Then the middle term: \begin{equation} \frac{(n^{n+1/2}e^{-n})^2}{(2n)^{2n+1/2}e^{-2n}} = \frac{n^{2n+1}e^{-2n}}{(2n)^{2n+1/2}e^{-2n}} = 2^{-(2n+\frac12)}\cdot n^{2n+1-2n-\frac12} = 2^{-2n}\cdot \frac{1}{\sqrt{2}}\cdot\sqrt{n} \end{equation} then we find: \begin{equation} C_n = 2^{2n}\sqrt{2}\frac{1}{\sqrt{n}} \end{equation} then taking limit, using product of limits, using Stirling, and multiplying by \(C_n\), we find: \begin{equation} \lim_{n\rightarrow\infty}n^{1/2}p_{2n} = \sqrt{2\pi}\cdot\frac{1}{2\pi}\cdot\sqrt{2} = \frac{1}{\sqrt{\pi}} \end{equation}
The above result shows that for \(q=1/2\), \begin{equation} p_{2n}\approx \frac{1}{\sqrt{\pi n}} \end{equation} for \(n\) large, then we have: \begin{equation} \sum_{k=0}^{\infty}p_k(0,0) = \sum_{n=0}^{\infty}p_{2n}(0,0) \approx \frac{1}{\sqrt{\pi}}\sum_{n=1}^{\infty}\frac{1}{\sqrt{n}} = \infty \end{equation} using 1.3’s results, this implies that the chain is recurrent.
Similar to the \(p=\frac12\) case, we attempt to derive a closed form formula using Stirling’s approximation. We previously had: \begin{equation} p_{2n} = {2n\choose n}q^{n}(1-q)^{n} = \frac{(2n)!}{(n!)(n!)}[q(1-q)]^n = \frac{(2n)!}{(n!)(n!)}\cdot \frac{(n^{n+\frac12}e^{-n})^2}{(2n)^{2n+\frac12}e^{-2n}}\cdot C_n\cdot q^n(1-q)^n \end{equation} where \begin{equation} C_n = \frac{(2n)^{2n+\frac12}\cdot e^{-2n}}{(n^{n+\frac12}e^{-n})^2} = \frac{2^{2n+\frac12}n^{2n+\frac12}e^{-2n}}{n^{2n+1}e^{-2n}} = 2^{2n+\frac12}\cdot\frac{1}{\sqrt{n}} \end{equation} take \(n\rightarrow\infty\), we have: \begin{equation} \sqrt{n}p_{2n}\sim \sqrt{n}\cdot\sqrt{2\pi}\cdot\frac{1}{2\pi}\cdot 2^{2n+\frac12}\cdot \frac{1}{\sqrt{n}}q^n(1-q)^n \end{equation} \begin{equation} \sim \frac{1}{\sqrt{\pi}}[4q(1-q)]^n \end{equation}
This means as \(n\) becomes large, \(p_{2n}\) behaves like: \begin{equation} p_{2n} \sim \frac{1}{\sqrt{\pi n}}[4q(1-q)]^n \end{equation} now sum over \(n\):
\begin{equation} \sum_{n=0}^{\infty}p_{2n} \sim \sum_{n=0}^{\infty}\frac{1}{\sqrt{n}}(4q(1-q))^n \end{equation} we notice that for \(q\neq 1/2\), \(q(1-q)<1/4\) since it is maximized at \((1/2, 1/4)\). This means \(4q(1-q)<1\). Now we see that each term in this series decays faster than a geometric series with terms \(<1\) (due to the \(1/\sqrt{n}\)) term. We conclude that \(\sum_n p_{2n}<\infty\), or the chain is transient. ▪
The Bernoulli random variable, or indicator function on an event \(E\subset\Omega\), is a random variable \(\Omega\rightarrow \{0,1\}\) with the following definition:
\[ \mathbf{1}_{E}(\omega) = \begin{cases} 1, & \text{ if } \omega\in E \\ 0, & \text{ else} \end{cases} \]
Let uppercase letters denote events.
Multiplication: \[ \mathbf{1}_{E_i} \cdot \mathbf{1}_{E_j} = \begin{cases} 1, & \text{ if } \omega \in E_i \text{ and } \omega \in E_j \\ 0, & \text{ otherwise} \end{cases} = \mathbf{1}_{E_i\cap E_j} \]
Complement: \[ \mathbf{1}_{\overline{E}} = 1 - \mathbf{1}_{E} \]
Union: By De Morgan’s law, for finite \(N\), \(\overline{\bigcup_{i=1}^NE_i} = \bigcap_{i=1}^N\overline{E}_i\).
\[ \mathbf{1}_{\bigcup_i E_i} = 1 - \mathbf{1}_{\overline{\bigcup_i E_i}} \]
\[ = 1 - \mathbf{1}_{\bigcap_i\overline{E}_i} = 1 - \prod_{i=1}^N\mathbf{1}_{\overline{E}_i} \]
\[ = 1 - \prod_{i=1}^N(1-\mathbf{1}_{E_i}) \] which relates to the principle of inclusion-exclusion.
Let \(A\in\Omega\) be an event, which occurs with probability \(p_A\) (we assume it has nonzero probability for now).
[ \mathbb{E}[\mathbf{1}_A] = 0\cdot (1-p_A) + 1\cdot p_A = p_A ]
[ \text{Var}(\mathbf{1}_A) = \mathbb{E}[\mathbf{1}_A^2] - \mathbb{E}[\mathbf{1}_A]^2 = 1^2\cdot p_A + 0^2\cdot (1-p_A) - p_A^2 = p_A-p_A^2= p_A(1-p_A) ]
Let \(A,B\) be two events with nonzero probabilities \(p_A,p_B\).
[ \text{cov}[\mathbf{1}_A, \mathbf{1}_B] = \mathbb{E}[\mathbf{1}_A\mathbf{1}_B] - \mathbb{E}[\mathbf{1}_A]\cdot\mathbb{E}[\mathbf{1}_B] = \mathbb{P}(A\cap B)- p_Ap_B ] where \(\mathbb{P}(A\cap B)\) is unknown.
[ \rho_{A,B} = \frac{\text{cov}[\mathbf{1}_A, \mathbf{1}_B]}{\sigma_A\cdot\sigma_B} ] where \(\sigma_{(\cdot)}\) denotes standard deviation.
In general:
[ \mathbb{E}[(\mathbf{1}_A)^n] = p_A ]
We noticed that the covariance and correlation depend on the quantity \(\mathbb{P}(A\cap B)\), which depending on the structure of sets \(A, B\) can vary. We provide a lower and upper bound.
We have:
\[ \underbrace{0}_{\text{disjoint events}} \le \mathbb{P}(A\cap B) \le \underbrace{ \min(p_A, p_B) }_{\text{one is contained in the other}} \]
Therefore:
[ -p_Ap_B \le \text{cov}[\mathbf{1}_A, \mathbf{1}_B] \le \min(p_A, p_B) - p_Ap_B = \min(p_A, p_B)(1- \max(p_A, p_B)) ]
And:
[ -\sqrt{\frac{p_A}{1-p_A}}\cdot\sqrt{\frac{p_B}{1-p_B}} \le \rho_{A,B} \le \frac{\min(p_A, p_B)(1- \max(p_A, p_B))}{\sqrt{p_A(1-p_A)}\cdot\sqrt{p_B(1-p_B)}} ]
Let \(E\) be an event with probability measure 0, \(p_E = 0\). Let \(X\) be any random variable with well-defined expected value. Then:
\[ \mathbb{E}[X\mathbf{1}_E] = \int_{\Omega}X(\omega)\mathbf{1}_E(\omega)\,d\mathbb{P}(\omega) = \int_E(\cdot) + \int_{\overline{E}}(\cdot) = 0 \] the first integral is over a null set, which yields 0, and in the second integral, the indicator always evaluates to 0, therefore it also becomes 0, hence the final equality.
The following property concerns taking expected value of a random variable only when it exceeds some threshold value. Let \(X\) be a random variable that is almost surely positive, and \(c\) be a constant. Then define the event \(C = \\{X\ge c\\}\):
\[ \mathbb{E}[X] = \mathbb{E}[X(\mathbf{1}_{C} + \mathbf{1}_{\overline{C}})] \ge \mathbb{E}[X\mathbf{1}_{C}] \]
Let \(X_1,\ldots, X_n\) be i.i.d. random variables with density \(f_X(x)\) and cumulative density function (CDF) \(F_X(x)\). We define two random variables:
\(L_n = \min\\{X_1,\ldots, X_n\\}\), \(U_n = \max\\{X_1,\ldots, X_n\\}\). The natural curiosity is to compute some statistics with them. Before that, we need to characterize their densities, which is not difficult due to the following set equivalences:
[ {L_n \ge x} = {X_1 \ge x}\cap \cdots\cap {X_n \ge x} ]
[ {U_n \le x} = {X_1 \le x}\cap \cdots\cap {X_n \le x} ]
[ \mathbb{P}(L_n \ge x) = 1 - F_{L_n}(x) ] therefore it is enough to derive \(\mathbb{P}(L_n \ge x)\), which gives the following expression due to i.i.d.:
[ \mathbb{P}(L_n \ge x) = \bigg[ 1 - F_{X}(x) \bigg]^n ] rearranging:
[ F_{L_n}(x) = 1 - (1 - F_{X}(x))^n ] taking derivative on both sides to get the PDF:
[ f_{L_n}(x)=\frac{dF_{L_n}}{dx} = n(1-F_X(x))^{n-1}\cdot\frac{dF_X}{dx} = n(1-F_X(x))^{n-1}f_X(x) ]
By similar argument as for \(L_n\) using i.i.d., we have:
[ f_{U_n}(x) = nf_X(x)[F_X(x)]^{n-1} ]
We now consider some concrete examples.
Let \(X_1,\ldots, X_n\) be i.i.d. uniform \([0,1]\) random variables.
The PDF and CDF of uniform random variable are:
[ f_X(x) = 1, F_X(x) = x, x\in [0,1] ]
Then using the formula for \(U_n\):
[ f_{U_n}(x) = nx^{n-1}, F_{U_n}(x) = x^n ]
[ \mathbb{E}[U_n] = \int_0^1x\cdot nx^{n-1}dx = \frac{n}{n+1} ]
Therefore:
\[ \text{Var}[U_n] = \mathbb{E}[U_n^2] - \mathbb{E}[U_n]^2 = \frac{n}{n+2} - \frac{n^2}{(n+1)^2} = \frac{n}{(n+1)^2(n+2)} = \frac{n}{n^3+4n^2+5n+2} \]
Plugging in the formula:
[ f_{L_n}(x) = n(1-F_X(x))^{n-1}f_X(x) = n(1-x)^{n-1} ]
[ F_{L_n}(x) = \bigg[ 1 - F_{X}(x) \bigg]^n = (1-x)^n ]
[ \mathbb{E}[L_n] = \int_0^1xn(1-x)^{n-1}dx = \frac{1}{n+1} ]
[ \mathbb{E}[L_n^2] = \int_0^1x^2n(1-x)^{n-1}dx = n\int_0^1[u^{n+1}-2u^n+u^{n-1}]du ]
[ = \frac{n}{n+2}-\frac{2n}{n+1}+1 ]
[ \text{Var}(L_n) = \mathbb{E}[L_n^2] - \mathbb{E}[L_n]^2 ]
[ = \frac{n}{n+2}-\frac{2n}{n+1}+1 - \frac{1}{(n+1)^2} ]
[ = \frac{n}{n^3+4n^2+5n+2} ]
See the previous section for a background. In this post, we consider a simpler case (generalization is to be discussed later).
Let \(X_1,X_2\) be independent uniform \([0,1]\) random variables. Define \(L = \min\\{X_1,X_2\\}\), \(U = \max\\{X_1,X_2\\}\). We explore the covariance of \(L, U\).
From before, we have:
[ f_L(x) = -2x+2, F_L(x) = -x^2+2x, f_U(x) = 2x, F_U(x) = x^2 ]
Let’s compute:
[ \mathbb{P}(L\ge y|U\le z) = \frac{\mathbb{P}(L\ge y, U\le z)}{\mathbb{P}(U\le z)} ]
The denominator is straightforward since we know the CDF of \(U\):
[ \mathbb{P}(U\le z) = F_U(z) = z^2 ]
The numerator is equivalent to \(\mathbb{P}(y\le X_1\le z, y\le X_2\le z)\) (see the set equivalence of max and min in the previous post). Then we have:
[ \mathbb{P}(L\ge y|U\le z) = \frac{\mathbb{P}(L\ge y, U\le z)}{\mathbb{P}(U\le z)} = \frac{(z-y)^2}{z^2} ]
We have:
[ \text{cov}(L, U) = \mathbb{E}[LU] - \mathbb{E}[L]\cdot\mathbb{E}[U] ] in the previous post, we computed the means: \(\mathbb{E}[L] = \frac{1}{3}, \mathbb{E}[U] = \frac{2}{3}\). We need to consider: [ \mathbb{E}[LU] = \int_0^1\int_0^1xyf_{L,U}(x,y)dxdy ] where \(f_{L,U}\) is the joint density. We need to derive this from the following probability:
[ \mathbb{P}(L\le y, U\le z) ] by the set equivalence, \(\\{U\le z\\}\) is equivalent to \(\\{X_1\le z\cap X_2\le z\\}\). The minimum \(\\{L\le z\\}\) means either \(X_1\le z\) or \(X_2\le z\), but leaving the other one inconclusive. This suggests conditioning. By some simple mental argument, we realize that \(\\{L\le y, U\le z\\}\) is equivalent to the set:
[ E_{y,z} = {X_1\le z, X_2\le z} \setminus ({X_1\ge y, X_2\ge y}\cap{X_1\le z, X_2\le z}) ]
This set \(E_{y,z}\) has probability:
[ \mathbb{P}(E_{y,z}) = z^2 - (z-y)^2 =: F_{L,U}(y,z) ]
Given the joint CDF, the joint PDF is derived by taking partial derivatives:
[ f_{L,U}(y,z) = \frac{\partial^2F_{L,U}}{\partial y\partial z} = 2 ] from which, the cross expectation term can be computed:
\[ \mathbb{E}[LU] = 2\int_0^1\int_0^y xy\,dx\,dy = \frac{1}{4} \] (the density \(f_{L,U}=2\) is supported on the region \(\{x\le y\}\), so the inner integral runs from \(0\) to \(y\)).
Finally, the covariance is finished:
[ \text{cov}(L,U) = \frac14 - \frac{2}{9} = \frac{1}{36} ]
The correlation is only one step away; we use the variance formula from the previous post:
\[ \rho_{L,U} = \frac{\text{cov}(L, U)}{\sigma_L\cdot\sigma_U} = \frac{1/36}{\sqrt{1/18}\cdot\sqrt{1/18}} = 18\cdot\frac{1}{36} = \frac12 \]
Now that we’ve solved a simple version (of \(X_1,X_2\)), it is natural to consider defining:
[ L_n = \min{X_1,\ldots, X_n} ]
[ U_n = \max{X_1,\ldots, X_n} ] and see what their correlation is.
This post is about finding a formula for general \(n\) case, with uniform \([0,1]\) distributions. See the previous problem here (\(n=2\)). Let us define for i.i.d. \(X_i\)’s:
[ L_n = \min_{1\le i\le n}X_i, U_n = \max_{1\le i\le n}X_i ]
Using the formulas from Order Statistics I, we have:
\[ f_{L_n}(x) = n(1-\underbrace{F_X(x)}_{=x})^{n-1}\underbrace{f_X(x)}_{=1} = n(1-x)^{n-1} \]
[ F_{L_n}(x) = 1-(1-x)^n ]
[ f_{U_n}(x) = nx^{n-1}, F_{U_n}(x) = x^n ]
We need to compute the following:
[ \mathbb{E}[L_nU_n] = \int_0^1\int_0^1yzf_{L_n,U_n}(y,z)dydz ] by finding the joint density \(f_{L_n,U_n}(y,z)\). We take a similar approach of starting with the joint CDF: \(F_{L_n,U_n}(y,z) = \mathbb{P}(L_n\le y, U_n\le z)\). The set equivalence simply generalizes from squares to hypercubes:
[ E_{n;y,z} := {L_n\le y, U_n\le z} = \bigg[\bigcap_{i}{X_i\le z}\bigg]\setminus\bigg( \bigcap_i{X_i\in [y, z]} \bigg) ]
The probability measure is then:
[ \mathbb{P}(E_{n;y,z}) = \frac{\text{Vol}(E_{n;y,z})}{\text{Vol}([0,1]^n)} = z^n - (z-y)^n =: \mathbb{P}(L_n\le y, U_n\le z) ]
Now by taking partial derivatives we find:
[ f_{L_n,U_n}(y,z) = \frac{\partial^2F_{L_n,U_n}}{\partial y\partial z} = n(n-1)(z-y)^{n-2} ] we see if \(n=2\), it indeed evaluates to \(2\).
Then:
\[ \mathbb{E}[L_nU_n] = n(n-1)\int_0^1\int_0^z yz(z-y)^{n-2}\,dy\,dz \] (the joint density is supported on \(\{y\le z\}\), so the inner integral runs from \(0\) to \(z\)).
Evaluating the integral (substitute \(u = z-y\) and expand), we obtain for every \(n\ge 2\):
\[ \mathbb{E}[L_nU_n] = \frac{1}{n+2} \]
Then the covariance is:
\[ \text{cov}(L_n, U_n) = \frac{1}{n+2} - \frac{1}{n+1}\cdot\frac{n}{n+1} = \frac{1}{(n+1)^2(n+2)} \]
\[ \rho_{L_n,U_n} = \frac{\text{cov}(L_n,U_n)}{\sigma_{L_n}\cdot\sigma_{U_n}} = \frac{ \frac{1}{(n+1)^2(n+2)}}{ \sqrt{ \frac{n}{(n+1)^2(n+2)}\cdot\frac{n}{(n+1)^2(n+2)} } } = \frac{1}{n} \] if we let \(n=2\), we see that we indeed recover \(\frac12\), from Order Statistics II.
We’ve already done a bit with order statistics of uniform random variables, see Order Stats I, Order Stats II and Order Stats III. In this post, we start playing with different densities.
The first density that comes to mind is standard normal. As usual, we will start with two variables – the simple case.
Define: \(U = \max\{X,Y\}\) where \(X, Y\) are independent \(\mathcal{N}(0,1)\) random variables.
We consider again the probability:
[ F_U(z) = \mathbb{P}(U \le z) = \mathbb{P}(X \le z)^2 = F_X^2(z) ] by set equivalence. Then:
[ f_U(z) = \frac{dF_U}{dz} = 2F_X(z)f_X(z) = \Phi(z)\cdot\sqrt{\frac{2}{\pi}}\exp(-\frac12z^2) ] which involves the CDF function of the normal density, \(\Phi\).
For the mean of \(U\), instead of direct algebraic computations, we use law of total expectation:
[ \mathbb{E}[U] = \mathbb{E}[U|X>Y]\mathbb{P}(X>Y) + \mathbb{E}[U|X\le Y]\mathbb{P}(X\le Y) = 2\mathbb{E}[U|X>Y]\mathbb{P}(X>Y) ] the last equality is by symmetry. Furthermore, \(\mathbb{P}(X>Y)=\frac12\) due to their joint distribution being symmetric around the line \(y=x\). Then it is enough to compute:
[ \mathbb{E}[U|X>Y] = \mathbb{E}[X|X>Y] ]
We then apply conditional tower property and write:
[ \mathbb{E}[X|X>Y] = \mathbb{E}[\mathbb{E}[X|Y]|X>Y] ] now it amounts to deriving an expression for \(\mathbb{E}[X|Y=y]\). But \(X,Y\) are independent, therefore we only have \(\mathbb{E}[X]\), which is equal to 0. The computations stop here, not super interesting.
We take a detour to consider the case when \(X,Y\) are not independent; suppose they follow a joint Gaussian distribution:
[ \mathcal{N}( \left[ \begin{array}{c} 0
0
\end{array} \right], \left[ \begin{array}{cc} 1 & \rho
\rho & 1
\end{array} \right] ) ]
In this post, we consider a uniform distribution over a circle with radius \(r\). The distribution is specified as the following:
[ f_{X,Y}(x,y) = \begin{cases} \frac{1}{\pi r^2}, \text{ if } \sqrt{x^2+y^2}\le r
0, \text{ otherwise} \end{cases} ]
Therefore, to find expected distance \(d = \sqrt{x^2+y^2}\) from the center, it is enough to compute:
[ \mathbb{E}[d] = \int_{{(x,y): \sqrt{x^2+y^2}\le r}}\sqrt{x^2+y^2}f_{X,Y}(x,y)dxdy ] using polar coordinates: \(dxdy \mapsto rdrd\theta\), we have:
[ = \frac{1}{\pi r^2}\int_0^{2\pi}\int_0^rr^2drd\theta = \frac{1}{\pi r^2}\cdot \frac{2\pi r^3}{3} = \frac{2r}{3} ]
Similarly, we can attempt to find the second moment:
[ \mathbb{E}[d^2] = \int_{{(x,y): \sqrt{x^2+y^2}\le r}}(x^2+y^2)f_{X,Y}(x,y)dxdy = \frac{1}{\pi r^2}\int_0^{2\pi}\int_0^rr^3drd\theta = \frac12r^2 ]
Then:
[ \text{Var}(d) = \mathbb{E}[d^2] - \mathbb{E}[d]^2 = \frac12r^2 - \frac{4}{9}r^2 = \frac{1}{18}r^2 ]
The Poisson process is a counting process, denoted as \(N_t\), that describes the number of random arrivals within the time interval \([0,t]\). The (discrete) distribution takes the form:
[ P(N_t = n) = \frac{(\lambda t)^n\exp(-\lambda t)}{n!} ] where \(\lambda\) is a parameter. We first verify that it indeed sums to 1:
[ \sum_{n=0}^{\infty}P(N_t = n) = \sum_{n=0}^{\infty}\frac{(\lambda t)^n\exp(-\lambda t)}{n!} ] recall the formula for \(e^x\):
[ e^x = \sum_{n=0}^{\infty}\frac{x^n}{n!} ] here let \(x=\lambda t\), we see that the sum is 1.
We then derive the mean and variance of this random variable.
[ \mathbb{E}[N_t] = \sum_{n=0}^{\infty}n\cdot\frac{(\lambda t)^n\exp(-\lambda t)}{n!} = 0 + \sum_{n=1}^{\infty}n\cdot\frac{(\lambda t)^{n}\exp(-\lambda t)}{n!} ]
[ = (\lambda t)\sum_{n=1}^{\infty}\frac{(\lambda t)^{n-1}\exp(-\lambda t)}{(n-1)!} ] let \(m=n-1\), we have: [ = (\lambda t)\underbrace{\sum_{m=0}^{\infty}\frac{(\lambda t)^m\exp(-\lambda t)}{m!}}_{=1} = \lambda t ]
The variance derivation follows the same format of isolating a \(\lambda t\) term, except that now one starts with \(n^2\) in the summation. After cancelling one factor of \(n\) against the factorial in the denominator, one is left with \(n\) in the summation over \((n-1)!\); writing \(n = (n-1)+1\) breaks the sum into two separate ones. The second sum is the same case as the mean, and a factor \((\lambda t)^2\) can be taken from the first sum, effectively leaving \(\mathbb{E}[N_t^2] = (\lambda t) + (\lambda t)^2\). Subtracting \(\mathbb{E}[N_t]^2\) as required in the variance calculation \(\text{Var}[N_t] = \mathbb{E}[N_t^2]-\mathbb{E}[N_t]^2\), one comes to the conclusion that the variance is also \(\lambda t\).
What does \(P(X = a) = 1\) mean for a random variable? —
\(X\) is a random variable that is equal to \(a\) with probability one. It is tempting to say that \(X\) is constant. However, we need to remember that \(X\) is not a variable in the traditional sense, but a mapping \(\Omega\rightarrow \mathbb{R}\) from the probability space to (a subset of) the real numbers.
If we define \(X\) as the following:
\[ X(\omega) = \begin{cases} a, & \text{ if } \omega\in\mathbb{R}\setminus\mathbb{Q} \\ 0, & \text{ if } \omega\in\mathbb{Q} \end{cases} \]
We see that the set \(\{X\neq a\}\) has Lebesgue measure 0. But \(X\) is not constant.
Let \(W_t\) denote a standard Brownian motion, in this post, we walk through the derivation of the joint density for [Z = (W_t, \int_0^tW_sds)], which reviews a few key properties of Brownian motions.
Define the random vector:
[ Z_t^{(n)} = (W_t, \sum_{j=1}^nW_{t_j}(t_j-t_{j-1})) ] where [t_j = jt/n], for some [j\in {1,2,\ldots, n}], and defines a partition.
\(Z_t^{(n)}\rightarrow Z_t\) almost surely. Since a linear combination of Gaussian random variables is still Gaussian, this means that the time integral \(\int_0^tW_sds\) is also Gaussian (as an almost-sure limit of Gaussians).
Furthermore, [Z_t^{(n)}] is jointly Gaussian for all [n], [t>0].
Therefore, it is enough to determine the mean and covariance matrix of [Z_t].
We have:
\(\mathbb{E}[W_t] = 0, \text{Var}[W_t] = t\) for all \(t\).
\(\mathbb{E}[\int_0^tW_sds] = 0\), as each approximating linear combination has mean 0.
It remains to consider:
[ \text{Var}[\int_0^tW_sds] = \mathbb{E}\bigg[ \bigg( \int_0^tW_sds \bigg)^2 \bigg] ]
[ = \mathbb{E}\bigg[ \int_0^t\int_0^tW_sW_{s’}dsds’ \bigg] = \int_0^t\int_0^t\mathbb{E}[W_sW_{s’}]dsds’ ]
\[ = \int_0^t\int_0^t\min\{s,s'\}\,ds\,ds' = \frac{1}{3}t^3 \] here we used the fact that \(W_t\) has independent increments, thus for \(a<b\), \(\mathbb{E}[W_aW_b] = \mathbb{E}[W_a(W_a+W_b-W_a)] = a + \mathbb{E}[W_a]\cdot\underbrace{\mathbb{E}[W_b-W_a]}_{=0} = a\). By symmetry, it is equal to \(b\) if \(b<a\). Thus it comes to \(\min\{a,b\}\).
Finally:
[ \text{Cov}[W_t, \int_0^tW_sds] = \mathbb{E}\bigg[ W_t\int_0^{t}W_sds \bigg] ]
[ = \int_0^t\underbrace{\mathbb{E}[W_tW_s]}_{=s, \text{ since \(s<t\)}}ds = \int_0^tsds = \frac12t^2 ]
Therefore, we see that:
[ Z_t \sim \mathcal{N}\bigg( \left[ \begin{array}{c} 0
0
\end{array} \right], \left[ \begin{array}{cc} t & t^2/2
t^2/2 & t^3/3
\end{array} \right] \bigg) ]
When one mentions matrix calculus, one typically means gradients/Jacobians with respect to non-scalar inputs — the kinds of objects you can play with in this Matrix Calculus calculator.
This post records useful properties when the matrix is parameterized. That is functions of the form: \(A(t):\mathbb{R}\rightarrow\mathbb{R}^{m\times n}\). Smoothness properties / well-definedness of derivatives are always assumed.
[ A’(t) = \left[ \begin{array}{cccc} a_{11}’(t) & a_{12}’(t) & \cdots & a_{1n}’(t)
a_{21}’(t) & a_{22}’(t) & \cdots & a_{2n}’(t)
\vdots & \cdots & \ddots & \vdots
a_{n1}’(t) & a_{n2}’(t) & \cdots & a_{nn}’(t) \end{array} \right] ]
[ \frac{dA^{-1}}{dt} = -A^{-1}(t)A’(t)A^{-1}(t) ]
Proof:
[ I = A^{-1}(t)A(t) ] then take derivatives on both sides and use product rule:
[ \mathbf{0} = \bigg[\frac{dA^{-1}}{dt}\bigg]A(t) + A^{-1}(t)A’(t) ] then rearrange.
Let \(\mathbf{v}(t),\mathbf{w}(t)\) be vectors parameterized by \(t\), then product rule gives:
[ \frac{d}{dt}[\mathbf{v}^T(t)\mathbf{w}(t)] = \bigg[\frac{d\mathbf{v}}{dt}\bigg]^T\mathbf{w}(t) + [\mathbf{v}(t)]^T\bigg[\frac{d\mathbf{w}}{dt}\bigg] ]
[ \frac{d\log\det A(t)}{dt} = \text{tr}(A^{-1}(t)A’(t)) ]
[ \frac{d}{dt}[A^{-T}A^{-1}] = \frac{d(A^{-1})^T}{dt}A^{-1}+(A^{-1})^T\frac{dA^{-1}}{dt} ]
[ = \bigg( \frac{dA^{-1}}{dt} \bigg)^TA^{-1} + (A^{-1})^T\frac{dA^{-1}}{dt} ]
Using the previous formula for matrix inverse:
[ = -A^{-T}\frac{dA}{dt}A^{-T}A^{-1} - A^{-T}A^{-1}\frac{dA}{dt}A^{-1} ]
The function \(\exp(-x^2)\) is an example of a function without an elementary antiderivative. But since \(\exp(-x^2)\) rapidly decreases at positive or negative infinities, we should expect to compute some finite number.
Let \(I = \int_0^{\infty}e^{-x^2}dx\), then: [ I^2 = \bigg(\int_0^{\infty}e^{-x^2}dx\bigg)^2 = \bigg(\int_0^{\infty}e^{-x^2}dx\bigg)\bigg(\int_0^{\infty}e^{-y^2}dy\bigg) ] the second equality is just renaming the dummy variable.
Then:
[ = \int_0^{\infty}e^{-(x^2+y^2)}dxdy ]
Now we use polar coordinates, [r = \sqrt{x^2+y^2}], [x=r\cos\theta], [y=r\sin\theta]. After substitution, and [dxdy = rdrd\theta] (the angle runs from \(0\) to \(\pi/2\) since \((x,y)\) covers the first quadrant).
We have: [ = \int_0^{\pi/2}\int_0^{\infty}e^{-r^2}rdrd\theta ]
which (inner) is a 1-dimensional integral. The function [re^{-r^2}] does not depend on [\theta], we can replace the outer integral with a constant [\pi/2]. To integrate [re^{-r^2}], let \(u = r^2\), then [\frac12du = rdr].
Finally we have: [ = \frac{\pi}{2}\cdot\frac12\int_0^{\infty}e^{-u}du = \frac{\pi}{4}\cdot \underbrace{\bigg(0 + 1 \bigg)}_{\lim_{u\rightarrow\infty}(-e^{-u}) - (-e^{-0})} = \frac{\pi}{4} ]
Since [I^2 = \frac{\pi}{4}], the original integral is [\sqrt{\pi}/2].
This set of posts are written to record interesting integral problems. That is, problems that contain some tricks or geometrical argument. The integrals are often in the Lebesgue sense. In this post, we integrate:
[ I(z) = \int_0^{z}\int_0^z\min(x,y)dxdy ] the domain \(\\{0\le x\le z, 0\le y\le z\\}\) is separated by the set \(\\{y=x\\}\). The subset below the straight line \(y=x\) means \(y\le x\), and being above the straight line means \(y\ge x\). Then we have:
[ I(z) = \int_{\{y\le x,\, x\in [0,z]\}}\min(x,y)dydx + \int_{\{y\ge x,\, y\in [0,z]\}}\min(x,y)dydx = 2\int_{\{y\le x,\, x\in [0,z]\}}\min(x,y)dydx ] the last equality is by symmetry (if we rename \(x,y\)). Then, since \(\min(x,y)=y\) on the region \(y\le x\):
[ = 2\int_{0}^z\int_0^{x}y\,dydx = \int_0^z x^2dx = \frac{z^3}{3} ]
This is defined with respect to a finite number of (standard) normal random variables \(\eta := [\eta_1, \eta_2, \ldots, \eta_d]^T\). \(z\) is a fixed constant, and \(F: \mathbb{R}^d\rightarrow\mathbb{R}\) is some (nonrandom) function that defines the event of interest:
\[E_z := \{\eta: F(\eta) > z\}\]If we recall the density for multivariate Gaussian random vectors, we have:
\[p_{\eta}(s) = \frac{1}{(2\pi)^{d/2}}\exp\bigg( -\frac12||s||_2^2 \bigg)\]Then we have the integral:
\[\mathbb{P}(E_z) = \frac{1}{(2\pi)^{d/2}}\int_{\mathbb{R}^d}\mathbf{1}_{E_z}\cdot \exp\bigg( -\frac12||s||^2 \bigg)ds\]where \(\mathbf{1}_{E_z}(s)\) is the indicator function that is 1 for any \(s\in E_z\).
The issues preventing us from evaluating this integral in general are that:
\(F\) potentially may be difficult to evaluate.
The boundaries of integration may not have simple geometry.
However, what we can notice is that the density function is rapidly decreasing with respect to \(\|s\|^2\). Forgetting about the set \(E_z\) for now and suppose we want to integrate with respect to the entire space (which should give us 1). The minimizer would certainly be the mode, \(s^* = [0, \ldots, 0]^T\), then we recover 1 as the final integral indeed.
For more complicated sets \(E_z\), it might be interesting to postulate that only points \(s\) within some \(\delta\) ball around a constrained minimizer, will matter in the integral.
That is:
\[\mathbb{P}(F(\eta) > z) \approx \frac{1}{(2\pi)^{d/2}}\int_{E_z}\exp\bigg( -\frac12||s^*||^2 \bigg)ds = \frac{\mu(E_z)}{(2\pi)^{d/2}}\exp(-\frac12||s^*||^2)\]where \(\mu(\cdot)\) denotes the Lebesgue measure. And since \(E_z\) does not necessarily contain \(0\), we define:
\begin{equation} \label{eqn:constrained-optimization} s^* := \text{argmin}_{s\in E_z}\frac12||s||^2 \end{equation}
Assuming that the above works well, evaluating the desired probability numerically has been converted into two sub-problems:
Evaluate \(\mu(E_z)\).
Solve constrained optimization problem \eqref{eqn:constrained-optimization}.
In particular, the above says roughly
\[\log \mathbb{P}(F(\eta) > z) \sim -\frac12\|s^*\|^2\]
\begin{equation} y_i \approx Ax_i \end{equation}
for all \(i=1,2,\ldots, m\). The natural start is to first collect the vectors in matrices:
\[X = \begin{bmatrix} \mid & \mid & \mid & \mid\\ x_1 & x_2 & \cdots & x_m \\ \mid & \mid & \mid & \mid \end{bmatrix}, Y = \begin{bmatrix} \mid & \mid & \mid & \mid\\ y_1 & y_2 & \cdots & y_m \\ \mid & \mid & \mid & \mid \end{bmatrix}\]Then one aims to find
\[Y \approx AX\]Perhaps, by solving:
\[\min_{A\in\mathbb{R}^{d\times d}}|| Y - AX ||_F^2\]Suppose for simplicity that \(X\) is full-rank. An SVD of \(X\) would give us:
\[X = U\Sigma V^T\]\(U\in\mathbb{R}^{d\times d}, V\in \mathbb{R}^{m\times m}\) are orthogonal matrices, \(\Sigma\in\mathbb{R}^{d\times m}\) is diagonal, containing the singular values (assume sorted).
Then we have that the solution should be \(A^* = YX^{\dagger}\), where \(X^{\dagger}\) is the (left) pseudoinverse of \(X\), such that \(Y - YX^{\dagger}X = 0\).
We might wish to use this matrix to predict things given a vector. The following MATLAB code implements just this: let \(u\) be a set of training data, \(u_0\) is some new observation. We generate a sequence of new observations after fitting.
function u_new = predict(u0, u, m)
% PREDICT  Fit a linear propagator to snapshot data and roll it forward.
%   u0 : initial condition (column vector)
%   u  : training data, one snapshot per column
%   m  : number of snapshots used for the fit (and generated)
% Returns u_new, a (d, m) matrix whose first column is u0.
dim = size(u, 1);
% paired snapshot matrices: each column of snaps_out follows the
% corresponding column of snaps_in by one step
snaps_in  = u(:, 1:m);
snaps_out = u(:, 2:m+1);
% least-squares propagator via the pseudoinverse
Amat = snaps_out*pinv(snaps_in);
% iterate the fitted linear map starting from u0
u_new = zeros(dim, m);
u_new(:, 1) = u0;
for step = 2:m
    u_new(:, step) = Amat*u_new(:, step-1);
end
end
More generally, we might say \(y_i \approx f(x_i)\)
And minimize:
\[\min_{f}\sum_{i=1}^m||y_i-f(x_i)||_2^2\]Let’s stick to linear \(f\) and say:
\[y_i \approx x_i + Bx_i\]Then in matrix form:
\[Y-X = BX\]whose solution follows from before, and \(B^* = (Y-X)X^{\dagger}\).
function u_new = predict(u0, u, m)
% PREDICT  Fit a linear increment model u_{k+1} = u_k + B*u_k and iterate it.
%   u0 : initial condition (column vector)
%   u  : training data, one snapshot per column
%   m  : number of snapshots used for the fit (and generated)
% Returns u_new, a (d, m) matrix whose first column is u0.
dim = size(u, 1);
% consecutive snapshot pairs
X = u(:, 1:m);
Y = u(:, 2:m+1);
% least-squares fit of the per-step increment: Y - X ~ B*X
B = (Y - X)*pinv(X);
% propagate forward from the initial condition
u_new = zeros(dim, m);
u_new(:, 1) = u0;
for k = 2:m
    prev = u_new(:, k-1);
    u_new(:, k) = prev + B*prev;
end
end
We can add some other terms, such as the following
\[y_i \approx Ax_i + b\]and find \(A, b\). This is readily solved from before because we can re-write:
\[Y \approx \begin{bmatrix} A & b \end{bmatrix} \cdot \begin{bmatrix} X\\ \mathbf{1}^T \end{bmatrix} =: \tilde{A}\tilde{X}\]where \(\mathbf{1}\) is a vector of all \(1\)’s.
function u_new = predict(u0, u, m)
% PREDICT  Fit an affine model y ~ A*x + b to snapshots and roll it forward.
%   u0 : initial condition (column vector)
%   u  : training data, one snapshot per column
%   m  : number of snapshots used for the fit (and generated)
% Returns u_new, a (d, m) matrix whose first column is u0.
dim = size(u, 1);
X = u(:, 1:m);
% augment with a row of ones so the offset b is fitted jointly with A
X_aug = [X; ones(1, m)];
Y = u(:, 2:m+1);
% least-squares solution of Y ~ [A b]*X_aug
AB = Y*pinv(X_aug);
A = AB(:, 1:end-1);
b = AB(:, end);
% iterate the fitted affine map
u_new = zeros(dim, m);
u_new(:, 1) = u0;
for k = 2:m
    u_new(:, k) = A*u_new(:, k-1) + b;
end
end
We can also write:
\[y_i \approx x_i + Bx_i + b\]which has the same form as before.
function u_new = predict(u0, u, m)
% PREDICT  Fit an affine increment u_{k+1} = u_k + B*u_k + b and iterate it.
%   u0 : initial condition (column vector)
%   u  : training data, one snapshot per column
%   m  : number of snapshots used for the fit (and generated)
% Returns u_new, a (d, m) matrix whose first column is u0.
dim = size(u, 1);
X = u(:, 1:m);
% augment with a row of ones so the offset b is fitted jointly with B
X_aug = [X; ones(1, m)];
Y = u(:, 2:m+1);
% least-squares solution of Y - X ~ [B b]*X_aug
BB = (Y - X)*pinv(X_aug);
B = BB(:, 1:end-1);
b = BB(:, end);
% iterate the fitted affine increment map
u_new = zeros(dim, m);
u_new(:, 1) = u0;
for k = 2:m
    prev = u_new(:, k-1);
    u_new(:, k) = prev + B*prev + b;
end
end
We will continue the code in another note, where we find slightly more interesting matrices. Also, it is not always necessary to assume \(X\) is full rank.
where \(\theta\in [-\pi,\pi]\) is the angle of displacement, \(\Omega = \frac{d\theta}{dt}\in\mathbb{R}\) is the angular velocity. We solve the system using different choices of initial conditions \((\theta(0),\Omega(0))^T = (\theta_0, \Omega_0)\).
Without damping, the mechanical energy should be conserved as:
\(E(t) = \frac12 \Omega(t)^2 - \cos(\theta(t)) = E_0, \forall t\in [0,T_m)\) where $T_m$ is the final time of observation.
Below we numerically solve the dynamical system from $t=0$ to $t=T_m$. We make use of scipy
’s built-in ODE integrator which by default uses fourth-order explicit Runge-Kutta time-stepping. We briefly describe the numerical scheme, consider the general initial value problem:
where \(\mathbf{x}\in\mathbb{R}^d\) is a $d$-dimensional vector and \(\mathbf{f}:\mathbb{R}^+ \times \mathbb{R}^d \rightarrow \mathbb{R}^d\) is a time-dependent vector field. In order to obtain a numerical solution, we need to rely on a discretized time grid \(T^{(0)}, T^{(1)},\cdots, T^{(N_t)}\). The Runge-Kutta method can be roughly considered as a weighted average of local slopes (taken at a number of well-chosen points on the grid) to approximate \(\frac{d\mathbf{x}}{dt}\). Assuming a uniform grid, \(\Delta t = T^{(n+1)}-T^{(n)}\),\(\mathbf{x}_n := \mathbf{x}(T^{(n)})\), the RK4 scheme for time step \(T^{(n)}\mapsto T^{(n+1)}\) is computed as follows:
\[k_1 = \mathbf{f}(T^{(n)},\mathbf{x}_n)\] \[k_2 = \mathbf{f}(T^{(n)}+\frac{\Delta t}{2}, \mathbf{x}_n + \Delta t \frac{k_1}{2})\] \[k_3 = \mathbf{f}(T^{(n)}+\frac{\Delta t}{2}, \mathbf{x}_n + \Delta t \frac{k_2}{2})\] \[k_4 = \mathbf{f}(T^{(n)}+\Delta t, \mathbf{x}_n + \Delta t {k_3})\] \[\mathbf{x}_{n+1} = \mathbf{x}_n + \frac{1}{6}\Delta t(k_1 + 2k_2 + 2k_3 + k_4)\]The period \(P\) of pendulum oscillations can be found from our dynamical system. \(P\in \mathbb{R}^+\) is such that \(\theta(t+P) = \theta(t)\).
\(E(t) = E_0 = \frac12\Omega(t)^2 - \cos(\theta(t))\) solving for \(\Omega(t)\) yields:
\[\Omega(t) = \pm \sqrt{2E_0 + \cos(\theta)}\]Consider the points where velocity is 0, substitute in \(\Omega = 0\): \(E_0 = -\cos(\theta)\)
solving for \(\theta\) we obtain:
\(\theta_m = \cos^{-1}(-E_0)\) $\theta_m$ would correspond to the highest point the pendulum can reach after releasing. Given \(E_0\), without loss of generality we consider a quarter-period in phase space, by symmetry, effectively \(\theta\in [0,\theta_m]\). Then:
\[\Omega = \frac{d\theta}{dt} = \sqrt{2E_0 + \cos(\theta)}\]then:
\[\frac{dt}{d\theta} = \frac{1}{\sqrt{2E_0 + \cos(\theta)}}\] \[dt = \frac{1}{\sqrt{2E_0 + \cos(\theta)}}d\theta\] \[\frac{P}{4} = \int_0^{\theta_m}\frac{1}{\sqrt{2E_0 + \cos(\theta)}}d\theta\]Alternatively, we may also solve the dynamical system numerically, and obtain the period \(P\) by observing the solution \(\theta(t)\). In our numerical evaluation below, we consider $P$ as a mapping of \(E_0\):
\[E_0 \mapsto \int_0^{\theta_m}\frac{1}{\sqrt{2E_0 + \cos(\theta)}}d\theta\]What is a “reduced-order PDF method”?
What is “defect”?
Why do these?
Consider the Van der Pol random ODE (RODE) model:
\[\begin{cases}\dot{x}(t) &= y(t) \\ \dot{y}(t) &= \mu(1-x(t)^2)y(t) - x(t) + \sigma\cdot \eta(t) \end{cases}\]where \(\eta\) is an Ornstein-Uhlenbeck (OU) process representing noisy perturbations of the system. In particular, it is a scaled temporal transform of the standard Wiener process. One may include the stochastic differential equation for \(\eta\) into the above system, whose probability density should be described by the Fokker-Planck equation, joint in all state variables \(x, y, \eta\).
Suppose we are only interested in the probability density of \(y\). Recall:
\[f_{x,y,\xi} = f_{x,\xi|y}\cdot f_{y}\]where \(f_{x,\xi|y}\) denotes a conditional density. Then we notice that we may derive an equation for \(f_y\) only, by integrating out the variables \(x, \xi\) in the joint Fokker-Planck equation, which should look like the following:
\begin{equation} \label{eqn:reduced-order-pdf} \partial_{t} f_y + \partial_{y}\bigg[ (\mu \cdot y \cdot \mathbb{E}[1-x^2|y] - \mathbb{E}[x|y] + \sigma \cdot \mathbb{E}[\eta | y])f_y \bigg] = 0 \end{equation}
or generically:
\[\partial_{t}f_y + \mathcal{L}f_y = 0\]where \(\mathcal{L}\) is a linear operator.
To solve the above equation, there is an issue that we need to deal with. The problem is that the integrals \(\mathbb{E}[x|y], \mathbb{E}[1-x^2|y]\) are not known analytically. Approximations are possible by either making assumptions about the noise
I am assuming that for practical engineering applications, the goal is to reduce the amount of “sitting in front of your desk with a pen and think for half a day”-type of experiences, and solve numerical problems that are rather generic (i.e. I throw you a system of stochastic equations, you give me the answer). Therefore, perhaps the methodology adopted in
\begin{equation} \label{eqn:regression-problem} \mathbb{E}[z(x, y, \xi)|y] \approx m(y;\theta) \end{equation}
The above has implicit time dependence. \(m(y;\theta)\) is a parameterized function approximator that fits some observed values \(\{z_i, y_i\}_{i=1}^{N}\). A good question is that why this is necessarily better than blindly plugging data of \(\{y_i\}_{i=1}^N\) into a kernel density estimator. Showing this rigorously requires a few more posts, however, the reason underlies “physics-informed” methods, which is an interpolation between using hand calculations to derive formulas, and using nonparameteric methods and not caring about formulas at all.
Suppose for now that we indeed use the regression method mentioned in \eqref{eqn:regression-problem}. Inevitably there will be errors, and that is kind of a problem since you are solving a PDE \eqref{eqn:reduced-order-pdf} with the wrong coefficients. At best (suppose we do not talk about pure math-type existence arguments), the numerical solution will have errors accumulating over time of the integration.
Suppose:
\begin{equation} \mathbb{E}[z|y] = m(y;\theta) + e(y; t) \end{equation}
where \(e\) is the (time-dependent) residual of the estimation problem \eqref{eqn:regression-problem}. The residual enters through the PDE formally as the following:
\[\partial_t f_y + \widehat{\mathcal{L}}f_y = \underbrace{-\widehat{\mathcal{E}}f_y}_{\text{''model defect''}}\]where \(\mathcal{L} = \widehat{\mathcal{L}} + \widehat{\mathcal{E}}\), i.e. we make some error in the linear operator due to estimation.
One way to deal with this defect is to use observed values of \(\{y_i\}\) to correct what’s technically predicted by the Fokker-Planck equation/reduced order equation. This is the same idea where one tries to use PDE solutions to solve inverse problems in physics-informed neural nets. We propose to learn the defect term as follows, put \(S(y,t) := -\widehat{\mathcal{E}}f_y\).
If \(S(y,t)\) were known exactly, one would use operator splitting (e.g. Lie-Trotter) to solve:
\[\begin{cases} \partial_t f_y^{(1)} + \widehat{\mathcal{L}}f_y^{(1)} = 0 \\ f_y^{(1)}(y, t_n) = f_y(y,t_n) \end{cases}\]where \(f_y\) is an initial condition at \(t=t_n\), we are propagating the PDE to time \(t_{n+1}\). And then solve
\[\begin{cases} \partial_t f_y^{(2)} = S(y,t) \\ f_y^{(2)}(y, t_n) = f_y^{(1)}(y, t_{n+1}) \end{cases}\]\(f_y^{(2)}(y, t_{n+1})\) is our numerical solution at \(t=t_{n+1}\).
So we reviewed Lie-Trotter scheme. But, in the practical problem that we described, we do not know \(S(y,t)\). However, we can reverse-engineer it by considering the following
\[\begin{cases} \partial_t f_y^{(1)} + \widehat{\mathcal{L}}f_y^{(1)} = 0 \\ f_y^{(1)}(y, t_n) = \widehat{f}_y(y,t_n) \end{cases}\]where \(\widehat{f}_y\) is an empirical distribution from observed data \(\{y_i\}\). Then the model defect can be computed just from numerical differentiation:
\[\frac{\widehat{f}_{y}(y, t_{n+k}) - f_y^{(1)}(y, t_{n+k})}{k\Delta t} = \frac12(\underbrace{S(y, t_n)}_{\text{available at step $n$}} + S(y, t_{n+k}))\]then:
\[S(y, t_{n+k}) = 2\bigg(\frac{\widehat{f}_{y}(y, t_{n+k}) - f_y^{(1)}(y, t_{n+k})}{k\Delta t}\bigg) - S(y, t_n)\]This method is documented in my related papers (2024)
One observes some histogram (e.g. sample attribute from a population), and suspects that this histogram changes in time according to some PDE model, and wish to make predictions cheaply.
Obtaining samples by accept-rejection sampling cheaply of a pre-defined quantity of interest depending on a set of stochastic constraints, and computing probabilistic profiles (e.g. moments), which is not possible with brute-force simulation, e.g. damages in rare cascading failures.
And a vectorized version below, with clearer outputs:
where \(g:\mathbb{R}^d \rightarrow \mathbb{R}^d\), parameterized by some parameters \(\theta\), which we do not explicitly consider here. This form is quite common, especially if one is working with residual connections in neural net layers.
The Jacobian matrix with of \(f\) with respect to input \(z\) is given by:
\[\frac{df}{dz} = I_d + \frac{dg_{\theta}}{dz}\]or in an alternative notation
\[J_f = I_d + J_g\]where we assume \(g(\cdot;\theta) = g_{\theta}(\cdot)\) is Lipschitz with constant \(<1\). Then necessarily, \(\det J_f > 0\). We can then apply the identity that \(\log \det(A) = \text{tr} \log A\) for \(A\) nonsingular. In particular:
\[\text{tr}(\log J_f) = \text{tr}(\log(I + J_g)) = \sum_{k=1}^{\infty}(-1)^{k+1}\frac{\text{tr}(J_g^k)}{k}\]which converges with the Lipschitz constraint.
Without delving into the details, we claim that a classic trick (1990) to compute the trace of a matrix \(B\) is to approximate:
\[\text{tr}(B) = \mathbb{E}[v^TBv] \approx \frac{1}{m}\sum_{i=1}^m(v^{(i)})^T(Bv^{(i)})\]where \(v\) denotes a random vector, \(v_i\) is a realization.
\[\mathbb{E}[v] = 0, \text{Var}[v] = I\]An example of such \(v\) can be that each entry is independently drawn from a Rademacher distribution.
Accepting that the above approach works, the situation in which one wants to consider such an approximation is when \(d\) is exceptionally large, and \(B\) is dense. In this case, computing the trace of \(B^k\) is at least the cost of doing eigenvalue decomposition, which is \(O(d^3)\).
In the case of the estimator, suppose \(O(m)\) samples are sufficient for convergence, computing the trace of \(B^k\) amounts to repeated matrix-vector multiplications, which is only \(O(kmd^2)\) in total, since we’d have to accumulate the multiplied random vectors from indices \(j=1, \ldots, k-1, k\).
We provide an implementation below and show that it converges. In this quick note, we did not discuss the following good questions:
Convergence order of the series.
Special structures of Jacobian \(J_g\) and speedups.
Distributions of \(v\) and associated biases, variances.
In this note, I provide a MATLAB implementation of finite volume method for solving 1d advection equation. Specifically, it is implementing the Lax-Wendroff scheme with MC limiter from LeVeque (Finite Volume Methods), Section 9.5.2 and onwards. This code assumes that the PDE is conservative, which means
\[\partial_t f + \partial_x(u(x)f) = 0\]and is solved with a padding of ng
ghost cells on either side of the mesh boundaries. For stability, temporal integration needs the time step to satisfy CFL condition.
function ff = laxWen1d(f, f_ind, nx, v, dx, dt)
% Takes one time step of 1d conservative advection equation
% via lax-wendroff with MC limiter
% Reference: LeVeque, Randall J. Finite volume methods for hyperbolic problems.
% Vol. 31. Cambridge university press, 2002.
% Input:
% f := solution at current time step, (nx+2*ng,1) vector
%      (the limiter below reads up to two cells away from each
%      non-ghost cell, so ng >= 2 ghost cells per side are required)
% f_ind := indices of f of non-ghost cells
% nx := number of non-ghost cells
% v := variable advec. coeff., (nx+1,1) vector
% defined on left cell edges (1-1/2):(nx+1/2)
% dt and dx are time and spatial step
% Output:
% ff := (nx,1) solution at next time step on non-ghost cells
% Split edge speeds into positive and negative parts for upwinding
indp = find(v>0); indm = find(v<0);
vp = zeros(nx+1,1); vm = vp;
vp(indp) = v(indp); vm(indm) = v(indm);
% 1st-order right and left going flux differences
% LeVeque sect. 9.5.2 The conservative equation
% At cell i: Apdq(i-1/2) = right going flux = F(i) - F(i-1/2),
% Amdq(i+1/2) = left going flux = F(i+1/2) - F(i),
% where F is numerical flux.
% Upwind edge flux: F(i-1/2) = vp(i-1/2)f(i-1) + vm(i-1/2)f(i),
% F(i-1/2) = vp(i-1/2)f(i-1) + vm(i-1/2)f(i).
% Apdq(i-1/2)= F(i) - F(i-1/2), Amdq(i+1/2) = F(i+1/2) - F(i);
% F(i-1/2) = vp(i-1/2)f(i-1) + vm(i-1/2)f(i)
% F(i+1/2) = vp(i+1/2)f(i) + vm(i+1/2)f(i+1)
% F(i): the cell-centered reference flux cancels in Apdq + Amdq
% (only the edge-flux difference enters the update), so take it as 0
Flux_i = 0;
% F(i-1/2)
Flux_m = vp(1:nx).*f(f_ind-1) + vm(1:nx).*f(f_ind);
% F(i+1/2)
Flux_p = vp(2:nx+1).*f(f_ind) + vm(2:nx+1).*f(f_ind+1);
% Apdq(i-1/2) and Amdq(i+1/2)
Apdq = Flux_i - Flux_m; Amdq = Flux_p - Flux_i;
% W = wave with speed u; p = i+1/2, m = i-1/2
Wp = f(f_ind+1) - f(f_ind); Wm = f(f_ind) - f(f_ind-1);
% theta's for limiter: LeVeque book sect. 9.13
% theta_i-1/2 = q(I) - q(I-1) / Wm , I = i-1 v_i-1/2>=0, =i+1 v_i-1/2<0
% theta_i+1/2 = q(I+1) - q(I) / Wp , I = i-1 v_i+1/2>=0, =i+1 v_i+1/2<0
% Allocate for limiters
Thm = zeros(nx,1); Thp = Thm;
% At i-1/2: compare against the wave on the upwind side of the edge
xsm = indm(indm<nx+1); xsp = indp(indp<nx+1);
Thm(xsm) = (f(f_ind(xsm)+1) - f(f_ind(xsm)))./Wm(xsm); % negative speed
Thm(xsp) = (f(f_ind(xsp)-1) - f(f_ind(xsp)-2))./Wm(xsp); % positive speed
% At i+1/2 (shift edge indices by one to address cell-centered arrays)
xsm = indm(indm>1)-1; xsp = indp(indp>1)-1;
Thp(xsm) = (f(f_ind(xsm)+2) - f(f_ind(xsm)+1))./Wp(xsm); % negative speed
Thp(xsp) = (f(f_ind(xsp)) - f(f_ind(xsp)-1))./Wp(xsp); % positive speed
% MC limiter: LeVeque sect. 6.12 TVD Limiters eqn (6.39a)
phip = max(0,min(min((1+Thp)/2,2),2*Thp));
phim = max(0,min(min((1+Thm)/2,2),2*Thm));
% mW = modified wave, LeVeque sect. 9.13 eqn (9.69)
mWp = phip.*Wp; mWm = phim.*Wm;
% 2nd-order limited corrections: LeVeque sect. 6.15 eqn (6.60)
Fp = 0.5*abs(v(2:nx+1)).*(1 - (dt/dx)*abs(v(2:nx+1))).*mWp;
Fm = 0.5*abs(v(1:nx)).*(1 - (dt/dx)*abs(v(1:nx))).*mWm;
% Conservative update: upwind fluctuations plus limited correction fluxes
ff = f(f_ind) - (dt/dx)*(Apdq + Amdq + Fp - Fm);
end
If you save the above code to a file, you can run the following experiments
% testing advection with u(x) = x - x^3
% define parameters
% final time
ttf = 1;
% grid size
dy = 0.01;
% spatial grid
yg = (-3:dy:3)';
% number of ghost cells
ng = 2;
% number of effective grid points
ny = length(yg) - 2*ng;
% indexer
idy_f = (ng+1):(ny+ng);
y = yg(idy_f);
% cell edges (left edges plus the rightmost edge): the advection
% coefficient must be evaluated on edges, per laxWen1d's interface
ye = [y - dy/2; y(end) + dy/2];
uu = ye - ye.^3;
% refined time step for CFL
% (bug fix: grid spacing variable is dy; the previous dx was undefined)
dtt = dy/max(abs(uu));
ntt = ceil(ttf/dtt) + 1;
tt = linspace(0, ttf, ntt);
% shrink dt so an integer number of steps lands exactly on ttf
dtt = ttf/(ntt-1);
f = zeros(ny, ntt);
% initial condition
f(:,1) = 3*exp(-0.5*(3*y).^2)/sqrt(2*pi);
for ii = 2:ntt
% pad with zero ghost cells on both sides
tmp = [zeros(ng,1); f(:, ii-1); zeros(ng,1)];
% take one step
f(:,ii) = laxWen1d(tmp, idy_f, ny, uu, dy, dtt);
end
You should expect to see the following solutions:
There is another more interesting example where the solution is approximating a fast-changing switch (continuous approximation to indicator). This PDE is very stiff, with \(\Delta t < 10^{-7}\). We provide the code and solutions below.
% stiff example: advection toward a continuous approximation of an
% indicator ("switch"); the CFL condition forces a very small time step
% final time
ttf = .02;
% grid size
dy = 0.001;
% spatial grid (slightly padded around [0, 1])
yg = (-0.02:dy:1.02)';
% number of ghost cells
ng = 2;
% number of effective grid points
ny = length(yg) - 2*ng;
% indexer for non-ghost cells
idy_f = (ng+1):(ny+ng);
y = yg(idy_f);
% cell edges where the advection coefficient is evaluated
ye = [y - dy/2; y(end) + dy/2];
% model constants (presumably a rate R and a diffusion scale D -- confirm)
R = 0.0135; D = 1e-4;
% edge-valued advection coefficient: steep exponential boundary layers
% near y = 0 and y = 1 make this PDE very stiff
uu = 2*R*(-exp(-20*ye) + exp(-200*ye) + exp(20*(ye - 1))...
- exp(200*(ye - 1)) - 0.2 ) / D;
% CFL-limited time step, then rounded so ntt steps cover [0, ttf] exactly
dtt = dy/max(abs(uu));
ntt = ceil(ttf/dtt) + 1;
tt = linspace(0, ttf, ntt);
dtt = ttf/(ntt-1);
f = zeros(ny, ntt);
% initial condition: narrow Gaussian centered at y = 0.9195
f(:,1) = 1e2*exp(-0.5*(1e2*(y-.9195)).^2)/sqrt(2*pi);
%f(:,1) = 1/(ye(end) - ye(1));
for ii = 2:ntt
% progress indicator (the run takes many small steps)
if mod(ii,10000)==0
disp(ii)
end
% pad with zero ghost cells, then advance one step
tmp = [zeros(ng,1); f(:, ii-1); zeros(ng,1)];
f(:,ii) = laxWen1d(tmp, idy_f, ny, uu, dy, dtt);
end