From 15cb401d67563db4fada405874ad90127b7e6f95 Mon Sep 17 00:00:00 2001 From: Arthur Conmy Date: Sun, 1 Nov 2020 15:30:55 +0000 Subject: [PATCH 01/14] normalisable implies norm is non-zero --- IB_M/quantum_mechanics.synctex(busy) | 0 IB_M/quantum_mechanics.tex | 5 ++++- 2 files changed, 4 insertions(+), 1 deletion(-) create mode 100644 IB_M/quantum_mechanics.synctex(busy) diff --git a/IB_M/quantum_mechanics.synctex(busy) b/IB_M/quantum_mechanics.synctex(busy) new file mode 100644 index 0000000..e69de29 diff --git a/IB_M/quantum_mechanics.tex b/IB_M/quantum_mechanics.tex index c609be0..aaedfbf 100644 --- a/IB_M/quantum_mechanics.tex +++ b/IB_M/quantum_mechanics.tex @@ -1007,10 +1007,13 @@ \subsubsection*{Cauchy-Schwarz inequality} \|\psi + \lambda \phi\|^2 &= (\psi + \lambda \phi, \psi + \lambda \phi) \\ &= (\psi, \psi) + \lambda(\psi, \phi) + \lambda^*(\phi, \psi) + |\lambda|^2 (\phi, \phi) \geq 0. \end{align*} - This is true for any complex $\lambda$. The $\phi = 0$ case is trivial. Otherwise, set + This is true for any complex $\lambda$. Set \[ \lambda = -\frac{(\phi, \psi)}{\|\phi\|^2}. \] + + Which is always well-defined since $\phi$ is normalizable. + Then the above equation becomes \[ \|\psi\|^2 - \frac{|(\psi, \phi)|^2}{\|\phi\|^2} \geq 0. From f45d0eb9b1bab49a530f12d081137007c1b29d6d Mon Sep 17 00:00:00 2001 From: Arthur Conmy Date: Sun, 1 Nov 2020 15:31:38 +0000 Subject: [PATCH 02/14] cleaned up some clutter --- IB_M/quantum_mechanics.synctex(busy) | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 IB_M/quantum_mechanics.synctex(busy) diff --git a/IB_M/quantum_mechanics.synctex(busy) b/IB_M/quantum_mechanics.synctex(busy) deleted file mode 100644 index e69de29..0000000 From 9b0a2f3b638304516c555eef003f26e71bf2abc0 Mon Sep 17 00:00:00 2001 From: Arthur Conmy Date: Sun, 1 Nov 2020 15:33:38 +0000 Subject: [PATCH 03/14] typo --- IB_M/quantum_mechanics.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/IB_M/quantum_mechanics.tex b/IB_M/quantum_mechanics.tex index aaedfbf..8011477 100644 --- a/IB_M/quantum_mechanics.tex +++ b/IB_M/quantum_mechanics.tex @@ -1009,10 +1009,10 @@ \subsubsection*{Cauchy-Schwarz inequality} \end{align*} This is true for any complex $\lambda$. Set \[ - \lambda = -\frac{(\phi, \psi)}{\|\phi\|^2}. + \lambda = -\frac{(\phi, \psi)}{\|\phi\|^2} \] - Which is always well-defined since $\phi$ is normalizable. + which is always well-defined since $\phi$ is normalizable. Then the above equation becomes \[ From fe565995f899dc8b66274e8a29552c27e7cbd85e Mon Sep 17 00:00:00 2001 From: Arthur Conmy Date: Sun, 1 Nov 2020 17:22:01 +0000 Subject: [PATCH 04/14] few quantum typos and consistent style things --- IB_M/quantum_mechanics.tex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/IB_M/quantum_mechanics.tex b/IB_M/quantum_mechanics.tex index 8011477..c382557 100644 --- a/IB_M/quantum_mechanics.tex +++ b/IB_M/quantum_mechanics.tex @@ -264,7 +264,7 @@ \subsection{Operators} \begin{tabular}{rll} position & $\hat{x} = x$ & $\hat{x} \psi = x\psi(x)$\\ momentum & $\hat{p} = -i\hbar \frac{\partial}{\partial x}$ & $\hat{p}\psi = -i\hbar \psi'(x)$\\ - energy & $H = \frac{\hat{p}^2}{2m} + V(\hat{x})$ & $H\psi = -\frac{\hbar^2}{2m} \frac{\partial^2}{\partial x^2}\psi + V(x)\psi(x)$ + energy & $H = \frac{\hat{p}^2}{2m} + V(\hat{x})$ & $H\psi = -\frac{\hbar^2}{2m} \psi''(x) + V(x)\psi(x)$ \end{tabular} \end{center} The final $H$ is called the Hamiltonian, where $m$ is the mass and $V$ is the potential. 
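As a quick sketch of how these operators act in practice (assuming SymPy is available; the Gaussian wavefunction and the harmonic potential below are illustrative choices, not taken from the notes):

import sympy as sp

x, hbar, m = sp.symbols('x hbar m', positive=True)
psi = sp.exp(-x**2 / 2)                          # an illustrative (unnormalised) wavefunction

x_hat = lambda f: x * f                          # position operator: multiply by x
p_hat = lambda f: -sp.I * hbar * sp.diff(f, x)   # momentum operator: -i hbar d/dx
V = x**2 / 2                                     # an illustrative potential V(x)
H = lambda f: p_hat(p_hat(f)) / (2 * m) + V * f  # Hamiltonian: p^2/(2m) + V(x)

print(sp.simplify(x_hat(psi)))                   # x * psi
print(sp.simplify(p_hat(psi)))                   # i hbar x * psi
print(sp.simplify(H(psi)))                       # kinetic term plus potential term acting on psi

The same lambdas can be composed to check, for example, that $(\hat{x}\hat{p} - \hat{p}\hat{x})\psi = i\hbar\psi$.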
We see that the Hamiltonian is just the kinetic energy $\frac{p^2}{2m}$ and the potential energy $V$. There will be more insight into why the operators are defined like this in IIC Classical Dynamics and IID Principles of Quantum Mechanics. @@ -1051,14 +1051,14 @@ \subsection{Ehrenfest's theorem} \end{align*} But we know \[ - \hat{x}H - H\hat{x}\Psi = -\frac{\hbar^2}{2m}(x\Psi'' - (x\Psi)'') + (xV\Psi - Vx\Psi) = -\frac{\hbar^2}{m}\Psi' = \frac{i\hbar}{m}\hat{p}\Psi. + (\hat{x}H - H\hat{x})\Psi = -\frac{\hbar^2}{2m}(x\Psi'' - (x\Psi)'') + (xV\Psi - Vx\Psi) = -\frac{\hbar^2}{m}\Psi' = \frac{i\hbar}{m}\hat{p}\Psi. \] So done. The second part is similar. We have \begin{align*} \frac{\d}{\d t}\bra \hat{p}\ket_\Psi &= (\dot{\Psi}, \hat{p}\Psi) + (\Psi, \hat{p}\dot{\Psi})\\ - &= \left(\frac{1}{\hbar}H\Psi, \hat{p}\Psi\right) + \left(\Psi, \hat{p}\left(\frac{1}{i\hbar}H\right)\Psi\right)\\ + &= \left(\frac{1}{i\hbar}H\Psi, \hat{p}\Psi\right) + \left(\Psi, \hat{p}\left(\frac{1}{i\hbar}H\right)\Psi\right)\\ \intertext{Since $H$ is Hermitian, we can move it around and get} &= -\frac{1}{i\hbar}(\Psi, H(\hat{p}\Psi)) + \frac{1}{i\hbar}(\Psi, \hat{p}(H\Psi))\\ &= \frac{1}{i\hbar}(\Psi, (\hat{p} H - H\hat{p}) \Psi). From f9200197e6cca152248fa3246f0e45cf72a2ac2a Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 24 Dec 2020 22:43:31 +0000 Subject: [PATCH 05/14] edits to stats first section --- IB_L/statistics.tex | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/IB_L/statistics.tex b/IB_L/statistics.tex index 944790d..8503411 100644 --- a/IB_L/statistics.tex +++ b/IB_L/statistics.tex @@ -96,8 +96,8 @@ \subsection{Mean squared error} \end{defi} We can express the mean squared error in terms of the variance and bias: \begin{align*} - \E_\theta[(\hat{\theta} - \theta)^2] &= \E_\theta[(\hat{\theta} - E_\theta(\hat{\theta}) + \E_\theta(\hat{\theta}) - \theta)^2]\\ - &= \E_\theta[(\hat{\theta} - E_\theta(\hat{\theta}))^2] + [\E_\theta(\hat{\theta}) - \theta]^2 + 2[\E_\theta(\hat{\theta}) - \theta]\E_\theta[\hat{\theta} - \E_\theta(\hat{\theta})]\\ + \E_\theta[(\hat{\theta} - \theta)^2] &= \E_\theta[(\hat{\theta} - \E_\theta(\hat{\theta}) + \E_\theta(\hat{\theta}) - \theta)^2]\\ + &= \E_\theta[(\hat{\theta} - \E_\theta(\hat{\theta}))^2] + [\E_\theta(\hat{\theta}) - \theta]^2 + 2\E_{\theta}[\E_\theta(\hat{\theta}) - \theta]\underbrace{\E_\theta[\hat{\theta} - \E_\theta(\hat{\theta})]}_{0}\\ &= \var(\hat{\theta}) + \bias^2(\hat{\theta}). \end{align*} If we are aiming for a low mean squared error, sometimes it could be preferable to have a biased estimator with a lower variance. This is known as the ``bias-variance trade-off''. @@ -236,7 +236,7 @@ \subsection{Sufficiency} Note that sufficient statistics are not unique. If $T$ is sufficient for $\theta$, then so is any 1-1 function of $T$. $\mathbf{X}$ is always sufficient for $\theta$ as well, but it is not of much use. How can we decide if a sufficient statistic is ``good''? -Given any statistic $T$, we can partition the sample space $\mathcal{X}^n$ into sets $\{\mathbf{x}\in \mathcal{X}: T(\mathbf{X}) = t\}$. Then after an experiment, instead of recording the actual value of $\mathbf{x}$, we can simply record the partition $\mathbf{x}$ falls into. If there are less partitions than possible values of $\mathbf{x}$, then effectively there is less information we have to store. +Given any statistic $T$, we can partition the sample space $\mathcal{X}^n$ into sets $\{\mathbf{x}\in \mathcal{X}: T(\mathbf{x}) = t\}$. 
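For instance (a tiny illustrative sketch, with three Bernoulli observations and $T(\mathbf{x}) = \sum x_i$ chosen purely for concreteness):

from itertools import product

# Partition of the sample space {0, 1}^3 induced by the statistic T(x) = x_1 + x_2 + x_3
partition = {}
for x in product([0, 1], repeat=3):
    partition.setdefault(sum(x), []).append(x)

for t, block in sorted(partition.items()):
    print(t, block)          # 4 blocks of the partition, versus 8 individual sample points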
Then after an experiment, instead of recording the actual value of $\mathbf{x}$, we can simply record the partition $\mathbf{x}$ falls into. If there are less partitions than possible values of $\mathbf{x}$, then effectively there is less information we have to store. If $T$ is sufficient, then this data reduction does not lose any information about $\theta$. The ``best'' sufficient statistic would be one in which we achieve the maximum possible reduction. This is known as the \emph{minimal sufficient statistic}. The formal definition we take is the following: @@ -318,7 +318,7 @@ \subsection{Sufficiency} We start with an easy estimator $\theta$ is $\tilde{\theta} = 1_{X_1 = 0}$, which is unbiased (i.e.\ if we observe nothing in the first observation period, we assume the event is impossible). Then \begin{align*} - \E[\tilde{\theta}\mid T = t] &= \P\left(X_1 = 0\mid \sum_1^n X_i = t\right)\\ + \E[\tilde{\theta}\mid T = t] &= \P\left(X_1 = 0 \mid \sum_1^n X_i = t\right)\\ &= \frac{\P(X_1 = 0)\P(\sum_2^n X_i = t)}{\P(\sum_1^n X_i = t)}\\ &= \left(\frac{n - 1}{n}\right)^t. \end{align*} @@ -340,7 +340,7 @@ \subsection{Sufficiency} \subsection{Likelihood} There are many different estimators we can pick, and we have just come up with some criteria to determine whether an estimator is ``good''. However, these do not give us a systematic way of coming up with an estimator to actually use. In practice, we often use the \emph{maximum likelihood estimator}. -Let $X_1, \cdots , X_n$ be random variables with joint pdf/pmg $f_\mathbf{X}(\mathbf{x}\mid \theta)$. We observe $\mathbf{X} = \mathbf{x}$. +Let $X_1, \cdots , X_n$ be random variables with joint pdf/pmf $f_\mathbf{X}(\mathbf{x}\mid \theta)$. We observe $\mathbf{X} = \mathbf{x}$. \begin{defi}[Likelihood] For any given $\mathbf{x}$, the \emph{likelihood} of $\theta$ is $\like(\theta) = f_\mathbf{X}(\mathbf{x} \mid \theta)$, regarded as a function of $\theta$. The \emph{maximum likelihood estimator} (mle) of $\theta$ is an estimator that picks the value of $\theta$ that maximizes $\like (\theta)$. @@ -437,7 +437,7 @@ \subsection{Confidence intervals} We know $\bar X \sim N(\theta, \frac{1}{n})$, so that $\sqrt{n}(\bar X - \theta)\sim N(0, 1)$. - Let $z_1, z_2$ be such that $\phi(z_2) - \phi(z_1) = 0.95$, where $\phi$ is the standard normal distribution function. + Let $z_1, z_2$ be such that $\Phi(z_2) - \Phi(z_1) = 0.95$, where $\Phi$ is the standard normal distribution function. We have $\P[z_1 < \sqrt{n}(\bar X - \theta) < z_2] = 0.95$, which can be rearranged to give \[ @@ -457,7 +457,7 @@ \subsection{Confidence intervals} \end{itemize} Usually $c_1, c_2$ are percentage points from a known standardised distribution, often equitailed. For example, we pick $2.5\%$ and $97.5\%$ points for a $95\%$ confidence interval. We could also use, say $0\%$ and $95\%$, but this generally results in a wider interval. -Note that if $(A(\mathbf{x}), B(\mathbf{x}))$ is a $100\gamma\%$ confidence interval for $\theta$, and $T(\theta)$ is a monotone increasing function of $\theta$, then $(T(A(\mathbf{x})), T(B(\mathbf{x})))$ is a $100\gamma\%$ confidence interval for $T(\theta)$. +Note that if $(A(\mathbf{X}), B(\mathbf{X}))$ is a $100\gamma\%$ confidence interval for $\theta$, and $T(\theta)$ is a monotone increasing function of $\theta$, then $(T(A(\mathbf{X})), T(B(\mathbf{X})))$ is a $100\gamma\%$ confidence interval for $T(\theta)$. \begin{eg} Suppose $X_1, \cdots, X_{50}$ are iid $N(0, \sigma^2)$. Find a $99\%$ confidence interval for $\sigma^2$. 
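As a numerical sketch of this example (NumPy/SciPy assumed; the simulated data and the seed below are purely illustrative), using the pivot $\sum X_i^2/\sigma^2 \sim \chi^2_{50}$ that drives the solution:

import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
x = rng.normal(0.0, 2.0, size=50)        # illustrative data: the true sigma^2 is 4 here

s = np.sum(x**2)                         # s / sigma^2 ~ chi^2_50 is the pivot
upper = stats.chi2.ppf(0.995, df=50)     # 99.5% point of chi^2_50
lower = stats.chi2.ppf(0.005, df=50)     # 0.5% point of chi^2_50
print((s / upper, s / lower))            # equitailed 99% confidence interval for sigma^2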
@@ -489,7 +489,7 @@ \subsection{Confidence intervals} By the Central Limit theorem, $\hat{p}$ is approximately $N(p, p(1 - p)/n)$ for large $n$. - So $\displaystyle \frac{\sqrt{n}(\hat{p} - p)}{\sqrt{p(1 - p)}}$ is approximately $N(0, 1)$ for large $n$. So we have + So $\displaystyle \frac{\sqrt{n}(\hat{p} - p)}{\sqrt{p(1 - p)}}$ is approximately $N(0, 1)$ for large $n$. So letting $z_{(1-\gamma) / 2}$ be the solution to $\Phi(z_{(1-\gamma) / 2}) - \Phi(-z_{(1-\gamma) / 2}) = 1 - \gamma$, we have \[ \P\left(\hat p - z_{(1 - \gamma)/2}\sqrt{\frac{p(1 - p)}{n}} < p < \hat{p} + z_{(1 - \gamma)/2}\sqrt{\frac{p(1 - p)}{n}}\right)\approx \gamma. \] @@ -527,7 +527,7 @@ \subsection{Confidence intervals} \end{eg} \subsection{Bayesian estimation} -So far we have seen the \emph{frequentist} approach to a statistical inference, i.e.\ inferential statements about $\theta$ are interpreted in terms of repeat sampling. For example, the confidence interval is what's the probability that the interval will contain $\theta$, not the probability that $\theta$ lies in the interval. +So far we have seen the \emph{frequentist} approach to a statistical inference, i.e.\ inferential statements about $\theta$ are interpreted in terms of repeat sampling. For example, the percentage confidence in a confidence interval is the probability that the interval will contain $\theta$, not the probability that $\theta$ lies in the interval. In contrast, the Bayesian approach treats $\theta$ as a random variable taking values in $\Theta$. The investigator's information and beliefs about the possible values of $\theta$ before any observation of data are summarised by a \emph{prior distribution} $\pi(\theta)$. When $\mathbf{X} = \mathbf{x}$ are observed, the extra information about $\theta$ is combined with the prior to obtain the \emph{posterior distribution} $\pi(\theta\mid \mathbf{x})$ for $\theta$ given $\mathbf{X} = \mathbf{x}$. @@ -560,9 +560,9 @@ \subsection{Bayesian estimation} Let $X = 1$ denote the event that I observe a head, $X = 0$ if a tail. Let $\theta$ denote the probability of a head. So $\theta$ is either 0.25, 0.5 or 0.75. - Our prior distribution is $\theta(\theta = 0.25) = \pi(\theta = 0.5) = \pi(\theta = 0.75) = 1/3$. + Our prior distribution is $\pi(\theta = 0.25) = \pi(\theta = 0.5) = \pi(\theta = 0.75) = 1/3$. - The probability mass function $f_X(x\mid \theta) = \theta^x(1 - \theta)^{1 - x}$. So we have to following results: + The probability mass function $f_X(x\mid \theta) = \theta^x(1 - \theta)^{1 - x}$. So we have the following results: \begin{center} \begin{tabular}{ccccc} \toprule @@ -605,7 +605,7 @@ \subsection{Bayesian estimation} This leads to a different conclusion than a frequentist analysis. Since nobody has died so far, the mle is $0$, which does not seem plausible. Using a Bayesian approach, we have a higher mean than $0$ because we take into account the data from other hospitals. \end{eg} -For this problem, a beta prior leads to a beta posterior. We say that the beta family is a \emph{conjugate} family of prior distribution for Bernoulli samples. +For this problem, a beta prior leads to a beta posterior. We say that the beta family is a \emph{conjugate} family of prior distributions for Bernoulli samples. Suppose that $a = b = 1$ so that $\pi (\theta) = 1$ for $0 < \theta < 1$ --- the uniform distribution. Then the posterior is $\betaD(\sum x_i + 1, n - \sum x_i + 1)$, with properties \begin{center} @@ -668,7 +668,7 @@ \subsection{Bayesian estimation} This occurs when each side is $1/2$. 
So $\hat{\theta}$ is the \emph{posterior median}. \begin{eg} - Suppose that $X_1, \cdots , X_n$ are iid $N(\mu, 1)$, and that a prior $\mu\sim N(0, \tau^{-2})$ for some $\tau^{-2}$. So $\tau$ is the certainty of our prior knowledge. + Suppose that $X_1, \cdots , X_n$ are iid $N(\mu, 1)$, and that a priori $\mu\sim N(0, \tau^{-2})$ for some $\tau^{-2}$. So $\tau$ is the certainty of our prior knowledge. The posterior is given by \begin{align*} @@ -676,7 +676,8 @@ \subsection{Bayesian estimation} &\propto \exp\left[-\frac{1}{2}\sum(x_i - \mu)^2\right]\exp\left[-\frac{\mu^2\tau^2}{2}\right]\\ &\propto \exp\left[-\frac{1}{2}(n + \tau^2)\left\{\mu - \frac{\sum x_i}{n + \tau^2}\right\}^2\right] \end{align*} - which is a normal distribution. So the posterior distribution of $\mu$ given $\mathbf{x}$ is a normal distribution with mean $\sum x_i/(n + \tau^2)$ and variance $1/(n + \tau^2)$. + + since we can regard $n$, $\tau$ and all the $x_i$ as constants in the normalisation term, and then complete the square with respect to $\mu$. So the posterior distribution of $\mu$ given $\mathbf{x}$ is a normal distribution with mean $\sum x_i/(n + \tau^2)$ and variance $1/(n + \tau^2)$. The normal density is symmetric, and so the posterior mean and the posterior media have the same value $\sum x_i/(n + \tau^2)$. @@ -698,7 +699,7 @@ \subsection{Bayesian estimation} Under absolute error loss, $\hat{\lambda}$ solves \[ - \int_0^{\lambda} \frac{(n + 1)^{\sum x_i + 1}\lambda^{\sum x_i}e ^{-(n + 1)\lambda}}{(\sum x_i)!}\;\d \lambda = \frac{1}{2}. + \int_0^{\hat{\lambda}} \frac{(n + 1)^{\sum x_i + 1}\lambda^{\sum x_i}e ^{-(n + 1)\lambda}}{\left(\sum x_i\right)!}\;\d \lambda = \frac{1}{2}. \] \end{eg} From 35af4fb8530369a4976f5071d1103743ae10e3dd Mon Sep 17 00:00:00 2001 From: Arthur Date: Thu, 24 Dec 2020 22:47:56 +0000 Subject: [PATCH 06/14] small quantum typos --- IB_M/quantum_mechanics.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/IB_M/quantum_mechanics.tex b/IB_M/quantum_mechanics.tex index c382557..b9c81e3 100644 --- a/IB_M/quantum_mechanics.tex +++ b/IB_M/quantum_mechanics.tex @@ -1934,7 +1934,7 @@ \subsubsection*{Commuting observables} \] This is consistent with a ``generalized uncertainty relation'' \[ - (\Delta A)_\psi (\Delta B)_\psi =\geq \frac{1}{2} |\bra [A, B]\ket_\psi|, + (\Delta A)_\psi (\Delta B)_\psi \geq \frac{1}{2} |\bra [A, B]\ket_\psi|, \] since if we if have a state that is simultaneously an eigenstate for $A$ and $B$, then the uncertainties on the left would vanish. So $\bra [A, B]\ket_\psi = 0$. The proof of this relation is on example sheet 3. @@ -2349,7 +2349,7 @@ \subsection{Joint eigenstates for a spherically symmetric potential} \] Alternatively, this requires \[ - \int_0^\infty |\chi(r)|^2 \;d\; r < \infty. + \int_0^\infty |\chi(r)|^2 \;\d r < \infty. 
\] \begin{eg}[Three-dimensional well] From 6b6204603e1a5f30d7c1c98a28c25cc529cc0eae Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 25 Dec 2020 21:44:17 +0000 Subject: [PATCH 07/14] removed extra line --- IB_L/statistics.tex | 2 -- 1 file changed, 2 deletions(-) diff --git a/IB_L/statistics.tex b/IB_L/statistics.tex index 8503411..77e668f 100644 --- a/IB_L/statistics.tex +++ b/IB_L/statistics.tex @@ -676,11 +676,9 @@ \subsection{Bayesian estimation} &\propto \exp\left[-\frac{1}{2}\sum(x_i - \mu)^2\right]\exp\left[-\frac{\mu^2\tau^2}{2}\right]\\ &\propto \exp\left[-\frac{1}{2}(n + \tau^2)\left\{\mu - \frac{\sum x_i}{n + \tau^2}\right\}^2\right] \end{align*} - since we can regard $n$, $\tau$ and all the $x_i$ as constants in the normalisation term, and then complete the square with respect to $\mu$. So the posterior distribution of $\mu$ given $\mathbf{x}$ is a normal distribution with mean $\sum x_i/(n + \tau^2)$ and variance $1/(n + \tau^2)$. The normal density is symmetric, and so the posterior mean and the posterior media have the same value $\sum x_i/(n + \tau^2)$. - This is the optimal estimator for both quadratic and absolute loss. \end{eg} From fdf0498b81f239685e4cfc95ae9ebe289489f7b3 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 25 Dec 2020 21:54:36 +0000 Subject: [PATCH 08/14] fixed extra space I think --- IB_M/quantum_mechanics.tex | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/IB_M/quantum_mechanics.tex b/IB_M/quantum_mechanics.tex index b9c81e3..77ad32b 100644 --- a/IB_M/quantum_mechanics.tex +++ b/IB_M/quantum_mechanics.tex @@ -1011,10 +1011,7 @@ \subsubsection*{Cauchy-Schwarz inequality} \[ \lambda = -\frac{(\phi, \psi)}{\|\phi\|^2} \] - - which is always well-defined since $\phi$ is normalizable. - - Then the above equation becomes + which is always well-defined since $\phi$ is normalizable, and then the above equation becomes \[ \|\psi\|^2 - \frac{|(\psi, \phi)|^2}{\|\phi\|^2} \geq 0. \] From 5b50093992298fecf6f0e710d54585f4e95e1437 Mon Sep 17 00:00:00 2001 From: Arthur Date: Fri, 25 Dec 2020 21:56:09 +0000 Subject: [PATCH 09/14] added back a line where needed I think --- IB_L/statistics.tex | 1 + 1 file changed, 1 insertion(+) diff --git a/IB_L/statistics.tex b/IB_L/statistics.tex index 77e668f..279f3e4 100644 --- a/IB_L/statistics.tex +++ b/IB_L/statistics.tex @@ -679,6 +679,7 @@ \subsection{Bayesian estimation} since we can regard $n$, $\tau$ and all the $x_i$ as constants in the normalisation term, and then complete the square with respect to $\mu$. So the posterior distribution of $\mu$ given $\mathbf{x}$ is a normal distribution with mean $\sum x_i/(n + \tau^2)$ and variance $1/(n + \tau^2)$. The normal density is symmetric, and so the posterior mean and the posterior media have the same value $\sum x_i/(n + \tau^2)$. + This is the optimal estimator for both quadratic and absolute loss. \end{eg} From ddfb4d629fa150d7dda7586ca5e25e8a34d90537 Mon Sep 17 00:00:00 2001 From: Arthur Date: Tue, 29 Dec 2020 21:53:11 +0000 Subject: [PATCH 10/14] updated stats section 2 --- IB_L/statistics.tex | 76 ++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git a/IB_L/statistics.tex b/IB_L/statistics.tex index 279f3e4..032285f 100644 --- a/IB_L/statistics.tex +++ b/IB_L/statistics.tex @@ -465,7 +465,7 @@ \subsection{Confidence intervals} So $R(\mathbf{X}, \sigma^2) = \sum_{i = 1}^{50} X_2^2/\sigma^2$ is a pivot. - Recall that $\chi_n^2(\alpha)$ is the upper $100\alpha\%$ point of $\chi_n^2$, i.e. 
+ Recall that $\chi_n^2(\alpha)$ is the upper $100\alpha\%$ point of $\chi_n^2$, i.e.\ \[ \P(\chi_n^2 \leq \chi_n^2(\alpha)) = 1 - \alpha. \] @@ -515,7 +515,7 @@ \subsection{Confidence intervals} \begin{eg} Suppose $X_1, X_2$ are iid from $U(\theta - 1/2, \theta + 1/2)$. What is a sensible $50\%$ confidence interval for $\theta$? - We know that each $X_i$ is equally likely to be less than $\theta$ or greater than $\theta$. So there is $50\%$ chance that we get one observation on each side, i.e. + We know that each $X_i$ is equally likely to be less than $\theta$ or greater than $\theta$. So there is $50\%$ chance that we get one observation on each side, i.e.\ \[ \P_\theta(\min(X_1, X_2) \leq \theta \leq \max(X_1, X_2)) = \frac{1}{2}. \] @@ -714,7 +714,7 @@ \section{Hypothesis testing} \item A coin has $\P(\text{Heads}) = \theta$, and is thrown independently $n$ times. We could have $H_0:\theta = \frac{1}{2}$ versus $H_1: \theta = \frac{3}{4}.$ \item Suppose $X_1, \cdots, X_n$ are iid discrete random variables. We could have $H_0:$ the distribution is Poisson with unknown mean, and $H_1:$ the distribution is not Poisson. \item General parametric cases: Let $X_1, \cdots , X_n$ be iid with density $f(x\mid \theta)$. $f$ is known while $\theta$ is unknown. Then our hypotheses are $H_0: \theta\in \Theta_0$ and $H_1:\theta\in \Theta_1$, with $\Theta_0\cap \Theta_1 = \emptyset$. - \item We could have $H_0: f = f_0$ and $H_1 = f = f_1$, where $f_0$ and $f_1$ are densities that are completely specified but do not come form the same parametric family. + \item We could have $H_0: f = f_0$ and $H_1: f = f_1$, where $f_0$ and $f_1$ are densities that are completely specified but do not come form the same parametric family. \end{itemize} \end{eg} @@ -777,9 +777,9 @@ \subsection{Simple hypotheses} \[ \beta = \P(\mathbf{X}\not\in C\mid f_1) = \int_{\bar C}f_1(\mathbf{x})\;\d \mathbf{x}. \] - Let $C^*$ be the critical region of any other test with size less than or equal to $\alpha$. Let $\alpha^* = \P(X\in C^*\mid f_0)$ and $\beta^* = \P(\mathbf{X}\not\in C^*\mid f_1)$. We want to show $\beta \leq \beta^*$. + Let $C^*$ be the critical region of any other test with size less than or equal to $\alpha$. Let $\alpha^* = \P(\mathbf{X} \in C^*\mid f_0)$ and $\beta^* = \P(\mathbf{X}\not\in C^*\mid f_1)$. We want to show $\beta \leq \beta^*$. - We know $\alpha^* \leq \alpha$, ie + We know $\alpha^* \leq \alpha$, i.e \[ \int_{C^*}f_0(\mathbf{x})\;\d \mathbf{x}\leq \int_Cf_0(\mathbf{x}) \;\d \mathbf{x}. \] @@ -825,7 +825,7 @@ \subsection{Simple hypotheses} \end{tikzpicture} \end{center} \end{proof} -Here we assumed the $f_0$ and $f_1$ are continuous densities. However, this assumption is need just to ensure that the likelihood ratio test of exactly size $\alpha$ exists. Even with non-continuous distributions, the likelihood ratio test is still a good idea. In fact, you will show in the example sheets that for a discrete distribution, as long as a likelihood ratio test of exactly size $\alpha$ exists, the same result holds. +Here we assumed the $f_0$ and $f_1$ are continuous densities. However, this assumption is only needed to ensure that the likelihood ratio test of exactly size $\alpha$ exists. Even with non-continuous distributions, the likelihood ratio test is still a good idea. In fact, you will show in the example sheets that for a discrete distribution, as long as a likelihood ratio test of exactly size $\alpha$ exists, the same result holds. 
\begin{eg} Suppose $X_1, \cdots, X_n$ are iid $N(\mu, \sigma_0^2)$, where $\sigma_0^2$ is known. We want to find the best size $\alpha$ test of $H_0: \mu = \mu_0$ against $H_1: \mu = \mu_1$, where $\mu_0$ and $\mu_1$ are known fixed values with $\mu_1 > \mu_0$. Then @@ -833,7 +833,7 @@ \subsection{Simple hypotheses} \Lambda_\mathbf{x}(H_0; H_1) &= \frac{(2\pi\sigma_0^2)^{-n/2}\exp\left(-\frac{1}{2\sigma^2_0}\sum(x_i - \mu_1)^2\right)}{(2\pi\sigma_0^2)^{-n/2}\exp\left(-\frac{1}{2\sigma^2_0}\sum(x_i - \mu_0)^2\right)}\\ &= \exp\left(\frac{\mu_1 - \mu_0}{\sigma_0^2}n\bar x + \frac{n(\mu_0^2 - \mu_1^2)}{2\sigma_0^2}\right). \end{align*} - This is an increasing function of $\bar x$, so for any $k$, $\Lambda_x > k\Leftrightarrow \bar x > c$ for some $c$. Hence we reject $H_0$ if $\bar x > c$, where $c$ is chosen such that $\P(\bar X > c \mid H_0) = \alpha$. + This is an increasing function of $\bar x$, so for any $k$, $\Lambda_\mathbf{x} > k\Leftrightarrow \bar x > c$ for some $c$. Hence we reject $H_0$ if $\bar x > c$, where $c$ is chosen such that $\P(\bar X > c \mid H_0) = \alpha$. Under $H_0$, $\bar X \sim N(\mu_0, \sigma_0^2/n)$, so $Z = \sqrt{n}(\bar X - \mu_0)/\sigma_0 \sim N(0, 1)$. @@ -865,7 +865,7 @@ \subsection{Composite hypotheses} \begin{defi}[Power function] The \emph{power function} is \[ - W(\theta) = \P(\mathbf{X}\in C\mid \theta) = \P(\text{reject }H_0\mid \theta), + W(\theta) = \P(\mathbf{X}\in C\mid \theta) = \P(\text{reject }H_0\mid \theta). \] \end{defi} We want $W(\theta)$ to be small on $H_0$ and large on $H_1$. @@ -873,7 +873,7 @@ \subsection{Composite hypotheses} \begin{defi}[Size] The \emph{size} of the test is \[ - \alpha =\sup_{\theta\in \Theta_0}W(\theta), + \alpha =\sup_{\theta\in \Theta_0}W(\theta). \] \end{defi} This is the worst possible size we can get. @@ -902,7 +902,7 @@ \subsection{Composite hypotheses} First consider testing $H_0': \mu = \mu_0$ against $H_1': \mu = \mu_1$, where $\mu_1 > \mu_0$. The Neyman-Pearson test of size $\alpha$ of $H_0'$ against $H_1'$ has \[ - C = \left\{x: \frac{\sqrt{n}(\bar x - \mu_0)}{\sigma_0} > z_\alpha\right\}. + C = \left\{\mathbf{x}: \frac{\sqrt{n}(\bar x - \mu_0)}{\sigma_0} > z_\alpha\right\}. \] We show that $C$ is in fact UMP for the composite hypotheses $H_0$ against $H_1$. For $\mu\in \R$, the power function is \begin{align*} @@ -917,7 +917,7 @@ \subsection{Composite hypotheses} \] So the first condition is satisfied. - For the second condition, observe that for any $\mu > \mu_0$, the Neyman Pearson size $\alpha$ test of $H_0'$ vs $H_1'$ has critical region $C$. Let $C^*$ and $W^*$ belong to any other test of $H_0$ vs $H_1$ of size $\leq \alpha$. Then $C^*$ can be regarded as a test of $H_0'$ vs $H_1'$ of size $\leq \alpha$, and the Neyman-Pearson lemma says that $W^*(\mu_1) \leq W(\mu_1)$. This holds for all $\mu_1 > \mu_0$. So the condition is satisfied and it is UMP. + For the second condition, observe that for any $\mu > \mu_0$, the Neyman-Pearson size $\alpha$ test of $H_0'$ vs $H_1'$ has critical region $C$. Let $C^*$ and $W^*$ belong to any other test of $H_0$ vs $H_1$ of size $\leq \alpha$. Then $C^*$ can be regarded as a test of $H_0'$ vs $H_1'$ of size $\leq \alpha$, and the Neyman-Pearson lemma says that $W^*(\mu_1) \leq W(\mu_1)$. This holds for all $\mu_1 > \mu_0$. So the condition is satisfied and it is UMP. \end{eg} We now consider likelihood ratio tests for more general situations. 
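Before moving on, here is a small numerical sketch of the power function $W(\mu)$ of the one-sided test just shown to be UMP (SciPy assumed; the particular $n$, $\sigma_0$ and $\alpha$ are illustrative):

import numpy as np
from scipy import stats

def power(mu, mu0=0.0, sigma0=1.0, n=25, alpha=0.05):
    """W(mu) for the test rejecting H_0: mu = mu0 when sqrt(n)(xbar - mu0)/sigma0 > z_alpha."""
    z_alpha = stats.norm.ppf(1 - alpha)            # upper alpha point of N(0, 1)
    return 1 - stats.norm.cdf(z_alpha - np.sqrt(n) * (mu - mu0) / sigma0)

print(power(0.0))     # equals the size alpha = 0.05 at mu = mu0
print(power(0.5))     # strictly larger for mu > mu0, tending to 1 as mu grows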
\begin{defi}[Likelihood of a composite hypothesis] @@ -937,7 +937,7 @@ \subsection{Composite hypotheses} Here $\Theta_0 = \{\mu_0\}$ and $\Theta = \R$. - For the denominator, we have $\sup_\Theta f(\mathbf{x}\mid \mu) = f(\mathbf{x}\mid \hat{\mu})$, where $\hat{\mu}$ is the mle. We know that $\hat{\mu} = \bar x$. Hence + For the numerator, we have $\sup_\Theta f(\mathbf{x}\mid \mu) = f(\mathbf{x}\mid \hat{\mu})$, where $\hat{\mu}$ is the mle. We know that $\hat{\mu} = \bar x$. Hence \[ \Lambda_\mathbf{x}(H_0; H_1) = \frac{(2\pi\sigma_0^2)^{-n/2}\exp\left(-\frac{1}{2\sigma^2_0}\sum(x_i - \bar x)^2\right)}{(2\pi\sigma_0^2)^{-n/2}\exp\left(-\frac{1}{2\sigma^2_0}\sum(x_i - \mu_0)^2\right)}. \] @@ -957,9 +957,9 @@ \subsection{Composite hypotheses} \[ \left|\frac{\sqrt{n}(\bar x - \mu_0)}{\sigma_0}\right| > z_{\alpha/2}. \] - Alternatively, since $\displaystyle \frac{n(\bar X - \mu_0)}{\sigma_0^2}\sim \chi_1^2$, we reject $H_0$ if + Alternatively, since $\displaystyle \frac{n(\bar X - \mu_0)^2}{\sigma_0^2}\sim \chi_1^2$, we reject $H_0$ if \[ - \frac{n(\bar X - \mu_0)^2}{\sigma_0^2} > \chi_1^2(\alpha), + \frac{n(\bar x - \mu_0)^2}{\sigma_0^2} > \chi_1^2(\alpha), \] (check that $z_{\alpha/2}^2 = \chi_1^2(\alpha)$). @@ -968,9 +968,9 @@ \subsection{Composite hypotheses} The next theorem allows us to use likelihood ratio tests even when we cannot find the exact relevant null distribution. -First consider the ''size`` or ''dimension`` of our hypotheses: suppose that $H_0$ imposes $p$ independent restrictions on $\Theta$. So for example, if $\Theta = \{\theta: \theta = (\theta_1, \cdots, \theta_k)\}$, and we have +First consider the ``size'' or ``dimension'' of our hypotheses: suppose that $H_0$ imposes $p$ independent restrictions on $\Theta$. So for example, if $\Theta = \{\theta: \theta = (\theta_1, \cdots, \theta_k)\}$, and we have \begin{itemize} - \item $H_0: \theta_{i_1} = a_1, \theta_{i_2} = a_2, \cdots \theta_{i_p} = a_p$; or + \item $H_0: \theta_{i_1} = a_1, \theta_{i_2} = a_2, \cdots , \theta_{i_p} = a_p$; or \item $H_0: A\theta = \mathbf{b}$ (with $A$ $p\times k$, $\mathbf{b}$ $p\times 1$ given); or \item $H_0: \theta_i = f_i(\varphi), i = 1, \cdots, k$ for some $\varphi = (\varphi_1, \cdots, \varphi_{k - p})$. \end{itemize} @@ -979,12 +979,12 @@ \subsection{Composite hypotheses} \begin{thm}[Generalized likelihood ratio theorem] Suppose $\Theta_0 \subseteq \Theta_1$ and $|\Theta_1| - |\Theta_0| = p$. Let $\mathbf{X} = (X_1, \cdots, X_n)$ with all $X_i$ iid. Then if $H_0$ is true, as $n\to \infty$, \[ - 2\log \Lambda_\mathbf{X}(H_0:H_1)\sim \chi_p^2. + 2\log \Lambda_\mathbf{X}(H_0;H_1)\sim \chi_p^2. \] If $H_0$ is not true, then $2\log \Lambda$ tends to be larger. We reject $H_0$ if $2\log \Lambda > c$, where $c = \chi_p^2(\alpha)$ for a test of approximately size $\alpha$. \end{thm} -For example, in our example above, $|\Theta_1| - |\Theta_0| = 1$, and in this case, we saw that under $H_0$, $2\log \Lambda \sim \chi_1^2$ \emph{exactly} for all $n$ in that particular case, rather than just approximately. +We will not prove this result here. In our example above, $|\Theta_1| - |\Theta_0| = 1$, and in this case, we saw that under $H_0$, $2\log \Lambda \sim \chi_1^2$ \emph{exactly} for all $n$ in that particular case, rather than just approximately. 
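A short computational sketch of the example above, the two-sided test of $H_0: \mu = \mu_0$ via $2\log\Lambda = n(\bar{x} - \mu_0)^2/\sigma_0^2$ (NumPy/SciPy assumed; the simulated data are illustrative only):

import numpy as np
from scipy import stats

rng = np.random.default_rng(1)
mu0, sigma0, n = 0.0, 1.0, 100
x = rng.normal(0.3, sigma0, size=n)                    # illustrative data, actually generated with mu = 0.3

two_log_lambda = n * (x.mean() - mu0)**2 / sigma0**2   # the generalized likelihood ratio statistic
p_value = stats.chi2.sf(two_log_lambda, df=1)          # refer 2 log Lambda to chi^2_1, as above
print(two_log_lambda, p_value)                         # with data like these, H_0 is typically rejected at 5%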
\subsection{Tests of goodness-of-fit and independence} \subsubsection{Goodness-of-fit of a fully-specified null distribution} @@ -1004,7 +1004,7 @@ \subsubsection{Goodness-of-fit of a fully-specified null distribution} \end{center} Is this compatible with a uniform distribution over the year? - Out of $n$ independent observations, let $N_i$ be the number of observations in $i$th set. So $(N_1,\cdots, N_k)\sim \multinomial(n; p_1, \cdots, p_n)$. + Out of $n$ independent observations, let $N_i$ be the number of observations in $i$th set. So $(N_1,\cdots, N_k)\sim \multinomial(k; p_1, \cdots, p_k)$. For a generalized likelihood ratio test of $H_0$, we need to find the maximised likelihood under $H_0$ and $H_1$. @@ -1016,11 +1016,11 @@ \subsubsection{Goodness-of-fit of a fully-specified null distribution} \] Here $|\Theta_1| - |\Theta_0| = k - 1$. So we reject $H_0$ if $2\log \Lambda > \chi_{k - 1}^2(\alpha)$ for an approximate size $\alpha$ test. - Under $H_0$ (no effect of month of birth), $\tilde{p}_i$ is the proportion of births in month $i$ in 1993/1994 --- this is \emph{not} simply proportional to the number of days in each month (or even worse, $\frac{1}{12}$), as there is for example an excess of September births (the ``Christmas effect''). So + Under $H_0$ (no effect of month of birth), $\tilde{p}_i$ is the proportion of births in month $i$ in 1993/1994 in the whole population --- this is \emph{not} simply proportional to the number of days in each month (or even worse, $\frac{1}{12}$), as there is for example an excess of September births (the ``Christmas effect''). So Then \[ - 2\log \Lambda = 2\sum n_i \log\left(\frac{n_i}{n\tilde{p}_i}\right) = 44.9. + 2\log \Lambda = 2\sum n_i \log\left(\frac{n_i}{n\tilde{p}_i}\right) = 44.86. \] $\P(\chi_{11}^2 > 44.86) = 3\times 10^{-9}$, which is our $p$-value. Since this is certainly less than 0.001, we can reject $H_0$ at the $0.1\%$ level, or can say the result is ``significant at the $0.1\%$ level''. @@ -1030,11 +1030,11 @@ \subsubsection{Goodness-of-fit of a fully-specified null distribution} Under $H_0$, we find mle $\hat{\theta}$ by maximizing $n_i \log p_i (\theta)$, and then \[ - 2\log \Lambda = 2\log \left(\frac{\hat{p_1}^{n_1}\cdots \hat{p_k}^{n_k}}{p_1(\hat{\theta})^{n_1}\cdots p_k (\hat{\theta})^{n_1}}\right) = 2\sum n_i \log \left(\frac{n_i}{np_i(\hat{\theta})}\right).\tag{2} + 2\log \Lambda = 2\log \left(\frac{\hat{p_1}^{n_1}\cdots \hat{p_k}^{n_k}}{p_1(\hat{\theta})^{n_1}\cdots p_k (\hat{\theta})^{n_k}}\right) = 2\sum n_i \log \left(\frac{n_i}{np_i(\hat{\theta})}\right).\tag{2} \] The degrees of freedom are $k - 1 - |\Theta_0|$. -\subsubsection{Pearson's Chi-squared test} +\subsubsection{Pearson's chi-squared test} Notice that the two log likelihoods are of the same form. In general, let $o_i = n_i$ (observed number) and let $e_i = n\tilde{p_i}$ or $np_i(\hat{\theta})$ (expected number). Let $\delta_i = o_i - e_i$. Then \begin{align*} 2\log \Lambda &= 2\sum o_i \log \left(\frac{o_i}{e_i}\right)\\ @@ -1045,7 +1045,7 @@ \subsubsection{Pearson's Chi-squared test} &\approx \sum \frac{\delta_i^2}{e_i}\\ &= \sum\frac{(o_i - e_i)^2}{e_i}. \end{align*} -This is known as the \emph{Pearson's Chi-squared test}. +This is known as the \emph{Pearson's chi-squared test}. \begin{eg} Mendel crossed 556 smooth yellow male peas with wrinkled green peas. From the progeny, let @@ -1065,7 +1065,7 @@ \subsubsection{Pearson's Chi-squared test} Here $|\Theta_0| = 0$ and $|\Theta_1| = 4 - 1 = 3$. So we refer to test statistics $\chi_3^2(\alpha)$. 
- Since $\chi_3^2(0.05) = 7.815$, we see that neither value is significant at $5\%$. So there is no evidence against Mendel's theory. In fact, the $p$-value is approximately $\P(\chi_3^2 > 0.6) \approx 0.96$. This is a \emph{really} good fit, so good that people suspect the numbers were not genuine. + Since $\chi_3^2(0.05) = 7.815$, we see that neither value is significant at $5\%$. So there is no evidence against Mendel's theory. In fact, the $p$-value is approximately $\P(\chi_3^2 > 0.6) \approx 0.90$. This is a \emph{really} good fit, so good that people suspect the numbers were not genuine. \end{eg} \begin{eg} @@ -1113,11 +1113,11 @@ \subsubsection{Testing independence in contingency tables} We have \[ - (N_{11}, \cdots, N_{1c}, N_{21}, \cdots, N_{rc}) \sim \multinomial (n; p_{11}, \cdots, p_{1c}, p_{21}, \cdots, p_{rc}). + (N_{11}, \cdots, N_{1c}, N_{21}, \cdots, N_{rc}) \sim \multinomial (rc; p_{11}, \cdots, p_{1c}, p_{21}, \cdots, p_{rc}). \] We may be interested in testing the null hypothesis that the two classifications are independent. So we test \begin{itemize} - \item $H_0$: $p_i = p_{i+}p_{+j}$ for all $i, j$, i.e.\ independence of columns and rows. + \item $H_0$: $p_{ij} = p_{i+}p_{+j}$ for all $i, j$, i.e.\ independence of columns and rows. \item $H_1$: $p_{ij}$ are unrestricted. \end{itemize} Of course we have the usual restrictions like $p_{++} = 1$, $p_{ij} \geq 0$. @@ -1203,9 +1203,9 @@ \subsubsection{Tests of homogeneity} \[ H_1: p_{ij}\text{ are unrestricted}. \] -Using $H_1$, +Using $H_1$, for some matrix of probabilities $(p_{ij})$, \[ - \like(p_{ij}) = \prod_{i = 1}^r \frac{n_{i+}!}{n_{i1}!\cdots n_{ic}!}p_{i1}^{n_{i1}} \cdots p_{ic}^{n_{ic}}, + \like((p_{ij})) = \prod_{i = 1}^r \frac{n_{i+}!}{n_{i1}!\cdots n_{ic}!}p_{i1}^{n_{i1}} \cdots p_{ic}^{n_{ic}}, \] and \[ @@ -1229,7 +1229,7 @@ \subsubsection{Tests of homogeneity} We reject $H_0$ if $2\log \Lambda > \chi_{(r - 1)(c - 1)}^2 (\alpha)$ for an approximate size $\alpha$ test. -If we let $o_{ij}= n_{ij}, e_{ij} = \frac{n_{i+}n_{+j}}{n_{++}}$, and $\delta_{ij} = o_{ij} - e_{ij}$, using the same approximating steps as for Pearson's Chi-squared, we obtain +If we let $o_{ij}= n_{ij}, e_{ij} = \frac{n_{i+}n_{+j}}{n_{++}}$, and $\delta_{ij} = o_{ij} - e_{ij}$, using the same approximating steps as for Pearson's chi-squared, we obtain \[ 2\log \Lambda \approx \sum \frac{(o_{ij} - e_{ij})^2}{e_{ij}}. \] @@ -1276,9 +1276,8 @@ \subsubsection{Confidence intervals and hypothesis tests} Note that when we say ``acceptance'', we really mean ``non-rejection''! The name is purely for historical reasons. \end{defi} -Suppose $X_1, \cdots, X_n$ have joint pdf $f_\mathbf{X}(\mathbf{x}\mid \theta)$ for $\theta\in \Theta$ - -\begin{thm}\leavevmode +\begin{thm}[Duality of hypothesis tests and confidence intervals]\leavevmode + Suppose $X_1, \cdots, X_n$ have joint pdf $f_\mathbf{X}(\mathbf{x}\mid \theta)$ for $\theta\in \Theta$. \begin{enumerate} \item Suppose that for every $\theta_0\in \Theta$ there is a size $\alpha$ test of $H_0: \theta = \theta_0$. Denote the acceptance region by $A(\theta_0)$. Then the set $I(\mathbf{X}) = \{\theta:\mathbf{X}\in A(\theta)\}$ is a $100(1 - \alpha)\%$ confidence set for $\theta$. \item Suppose $I(\mathbf{X})$ is a $100(1 - \alpha)\%$ confidence set for $\theta$. Then $A(\theta_0) = \{\mathbf{X}: \theta_0 \in I(\mathbf{X})\}$ is an acceptance region for a size $\alpha$ test of $H_0: \theta = \theta_0$. 
@@ -1292,7 +1291,8 @@ \subsubsection{Confidence intervals and hypothesis tests} For (i), since the test is size $\alpha$, we have \[ - \P(\text{accept }H_0\mid H_0\text{ is true}) = \P(\mathbf{X}\in A_(\theta_0)\mid \theta=\theta_0) = 1 - \alpha. + \P(\text{accept }H_0\mid H_0\text{ is true}) = \P(\mathbf{X}\in A + (\theta_0)\mid \theta=\theta_0) = 1 - \alpha. \] And so \[ @@ -1313,7 +1313,7 @@ \subsubsection{Confidence intervals and hypothesis tests} One way is to use the theorem and find the confidence set that belongs to the hypothesis test that we found in the previous example. We find a test of size 0.05 of $H_0 : \mu= \mu_0$ against $H_1: \mu\not= \mu_0$ that rejects $H_0$ when $|\sqrt{n}(\bar x - \mu_0)| > 1.96$ (where 1.96 is the upper $2.5\%$ point of $N(0, 1)$). - Then $I(\mathbf{X}) = \{\mu: \mathbf{X}\in A(\mu)\} = \{\mu:|\sqrt{n}(\bar X - \mu)| < 1.96\%\}$. So a $95\%$ confidence set for $\mu$ is $(\bar X - 1.96/\sqrt{n}, \bar X + 1.96/\sqrt{n})$. + Then $I(\mathbf{X}) = \{\mu: \mathbf{X}\in A(\mu)\} = \{\mu:|\sqrt{n}(\bar X - \mu)| < 1.96\}$. So a $95\%$ confidence set for $\mu$ is $(\bar X - 1.96/\sqrt{n}, \bar X + 1.96/\sqrt{n})$. \end{eg} \subsection{Multivariate normal theory} \subsubsection{Multivariate normal distribution} @@ -1525,7 +1525,7 @@ \subsection{Student's \texorpdfstring{$t$}{t}-distribution} Why would we define such a weird distribution? The typical application is to study random samples with unknown mean \emph{and} unknown variance. -Let $X_1, \cdots, X_n$ be iid $\N(\mu, \sigma^2)$. Then $\bar X \sim N(\mu, \sigma^2/n)$. So $Z = \frac{\sqrt{n}(\bar X - \mu)}{\sigma} \sim N(0, 1)$. +Let $X_1, \cdots, X_n$ be iid $N(\mu, \sigma^2)$. Then $\bar X \sim N(\mu, \sigma^2/n)$. So $Z = \frac{\sqrt{n}(\bar X - \mu)}{\sigma} \sim N(0, 1)$. Also, $S_{XX}/\sigma^2 \sim \chi^2_{n - 1}$ and is independent of $\bar X$, and hence $Z$. So \[ @@ -1690,7 +1690,7 @@ \subsection{Linear models} \[ -2 x_{ik}(Y_i - x_{ij}\hat{\beta}_j) = 0 \] -for each $k$ (with implicit summation over $i$ and $j$), i.e. +for each $k$ (with implicit summation over $i$ and $j$), i.e.\ \[ x_{ik}x_{ij}\hat{\beta_j} = x_{ik}Y_i \] @@ -1983,7 +1983,7 @@ \subsection{Linear models with normal assumptions} \[ \left.\frac{\partial l}{\partial \sigma^2}\right|_{\hat{\boldsymbol\beta}, \hat{\sigma}^2} = 0, \] -i.e. +i.e.\ \[ -\frac{n}{2\sigma^2} + \frac{S(\hat{\boldsymbol\beta})}{2 \hat{\sigma}^4} = 0 \] From 643b4d93b552fd0770547823d75875db0e6d9656 Mon Sep 17 00:00:00 2001 From: Arthur Date: Wed, 30 Dec 2020 21:32:26 +0000 Subject: [PATCH 11/14] fixed two things --- IB_L/statistics.tex | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/IB_L/statistics.tex b/IB_L/statistics.tex index 032285f..3fcec56 100644 --- a/IB_L/statistics.tex +++ b/IB_L/statistics.tex @@ -1203,7 +1203,7 @@ \subsubsection{Tests of homogeneity} \[ H_1: p_{ij}\text{ are unrestricted}. \] -Using $H_1$, for some matrix of probabilities $(p_{ij})$, +Using $H_1$, for any matrix of probabilities $(p_{ij})$, \[ \like((p_{ij})) = \prod_{i = 1}^r \frac{n_{i+}!}{n_{i1}!\cdots n_{ic}!}p_{i1}^{n_{i1}} \cdots p_{ic}^{n_{ic}}, \] @@ -1291,8 +1291,7 @@ \subsubsection{Confidence intervals and hypothesis tests} For (i), since the test is size $\alpha$, we have \[ - \P(\text{accept }H_0\mid H_0\text{ is true}) = \P(\mathbf{X}\in A - (\theta_0)\mid \theta=\theta_0) = 1 - \alpha. + \P(\text{accept }H_0\mid H_0\text{ is true}) = \P(\mathbf{X}\in A(\theta_0)\mid \theta=\theta_0) = 1 - \alpha. 
\] And so \[ From afda01bc6a1b24e5234994f53ee6aa6cf789cd27 Mon Sep 17 00:00:00 2001 From: Arthur Date: Sun, 16 May 2021 14:11:26 +0100 Subject: [PATCH 12/14] GRM typo --- IB_L/groups_rings_and_modules.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/IB_L/groups_rings_and_modules.tex b/IB_L/groups_rings_and_modules.tex index 58182fc..a17e276 100644 --- a/IB_L/groups_rings_and_modules.tex +++ b/IB_L/groups_rings_and_modules.tex @@ -4347,7 +4347,7 @@ \subsection{Conjugacy of matrices*} We can reinterpret this a little bit, using our classification of finitely-generated modules. \begin{cor} - There is a bijection between conjugacy classes of $n \times n$ matrices over $\F$ and sequences of monic polynomials $d_1, \cdots, d_r$ such that $d_1 \mid d_2 \mid \cdots \mid d_r$ and $\deg (d_1,\cdots, d_r) = n$. + There is a bijection between conjugacy classes of $n \times n$ matrices over $\F$ and sequences of monic polynomials $d_1, \cdots, d_r$ such that $d_1 \mid d_2 \mid \cdots \mid d_r$ and $\deg (d_1 + \cdots + d_r) = n$. \end{cor} \begin{eg} From bb8bda1f60dfc5513e92a2af1e1138436a786f6b Mon Sep 17 00:00:00 2001 From: ArthurConmy Date: Tue, 12 Oct 2021 22:55:28 +0100 Subject: [PATCH 13/14] fix algebraic topology typos --- II_M/algebraic_topology.tex | 63 +++++++++++++++++++++++-------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/II_M/algebraic_topology.tex b/II_M/algebraic_topology.tex index 500c74a..043c51b 100644 --- a/II_M/algebraic_topology.tex +++ b/II_M/algebraic_topology.tex @@ -128,7 +128,7 @@ \subsection{Cell complexes} \] where the equivalence relation $\sim$ is the equivalence relation generated by $x\sim f(x)$ for all $x\in S^{n - 1}\subseteq D^n$ (and $\amalg$ is the disjoint union). - Intuitively, a map $f: S^{n - 1}\to X$ just picks out a subset $X$ that looks like the sphere. So we are just sticking a disk onto $X$ by attaching the boundary of the disk onto a sphere within $X$. + Intuitively, a map $f: S^{n - 1}\to X$ just picks out a subset of $X$ that looks like the sphere. So we are just sticking a disk onto $X$ by attaching the boundary of the disk onto a sphere within $X$. \begin{center} \begin{tikzpicture} \draw ellipse (0.5 and 1); @@ -701,7 +701,7 @@ \subsection{The fundamental group} Immediate from our previous lemmas. \end{proof} -Often, in mathematics, after defining a term, we give lots of examples of it. Unfortunately, it is rather difficult to prove that a space has a non-trivial fundamental group, until we have developed some relevant machinery. Hence we will have to wait for a while before we have some concrete examples. Instead, we will look at some properties of the fundamental group first. +Often in mathematics, after defining a term, we give lots of examples of it. Unfortunately, it is rather difficult to prove that a space has a non-trivial fundamental group, until we have developed some relevant machinery. Hence we will have to wait for a while before we have some concrete examples. Instead, we will look at some properties of the fundamental group first. \begin{defi}[Based space] A \emph{based space} is a pair $(X, x_0)$ of a space $X$ and a point $x_0\in X$, the \emph{basepoint}. A \emph{map of based spaces} @@ -1399,9 +1399,9 @@ \subsection{Covering space} \item The stabilizer of $\tilde{x}_0 \in p^{-1}(x_0)$ is $p_*(\pi_1(\tilde{X}, \tilde{x}_0)) \subseteq \pi_1(X, x_0)$. 
\item If $\tilde{X}$ is path connected, then there is a bijection \[ - p_* \pi_1(\tilde{X}, \tilde{x}_0)\backslash \pi_1(X, x_0) \to p^{-1}(x_0). + p_* (\pi_1(\tilde{X}, \tilde{x}_0))\backslash \pi_1(X, x_0) \to p^{-1}(x_0). \] - Note that $p_* \pi_1(\tilde{X}, \tilde{x}_0)\backslash \pi_1(X, x_0)$ is not a quotient, but simply the set of cosets. We write it the ``wrong way round'' because we have right cosets instead of left cosets. + Note that $p_*(\pi_1(\tilde{X}, \tilde{x}_0))\backslash \pi_1(X, x_0)$ is not a quotient, but simply the set of cosets. We write it the ``wrong way round'' because we have right cosets instead of left cosets. \end{enumerate} \end{lemma} Note that this is great! If we can find a covering space $p$ and a point $x_0$ such that $p^{-1}(x_0)$ is non-trivial, then we immediately know that $\pi_1(X, x_0)$ is non-trivial! @@ -1716,7 +1716,7 @@ \subsection{The Galois correspondence} Now we prove that every subgroup of $\pi_1$ comes from exactly one covering space. What this statement properly means is made precise in the following proposition: \begin{prop} - Let $(X, x_0)$, $(\tilde{X}_1, \tilde{x}_1)$, $(\tilde{X}_2, \tilde{X}_2)$ be path-connected based spaced, and $p_i: (\tilde{X}_i, \tilde{x}_i) \to (X, x_0)$ be covering maps. Then we have + Let $(X, x_0)$, $(\tilde{X}_1, \tilde{x}_1)$, $(\tilde{X}_2, \tilde{x}_2)$ be path-connected based spaced, and $p_i: (\tilde{X}_i, \tilde{x}_i) \to (X, x_0)$ be covering maps. Then we have \[ p_{1*}\pi_1(\tilde{X}_1, \tilde{x}_1) = p_{2*} \pi_1(\tilde{X}_2, \tilde{x_2}) \] @@ -2209,7 +2209,7 @@ \subsection{Seifert-van Kampen theorem} \[ \pi_1(S^n) \cong \pi_1(\R^n) \underset{\pi_1(S^{n - 1})}{*} \pi_1(\R^n) \cong 1 \underset{\pi_1(S^{n - 1})}{*} 1 \] - It is easy to see this is the trivial group. We can see this directly form the universal property of the amalgamated free product, or note that it is the quotient of $1 * 1$, which is $1$. +It is easy to see this is the trivial group. We can see this directly from the universal property of the amalgamated free product, or note that it is the quotient of $1 * 1$, which is $1$. So for $n \geq 2$, $\pi_1(S^n) \cong 1$. \end{eg} @@ -2428,7 +2428,7 @@ \subsection{The effect on \texorpdfstring{$\pi_1$}{pi1} of attaching cells} \end{proof} The more interesting case is when we have smaller dimensions. \begin{thm} - If $n = 2$, then the natural map $\pi_1(X, X_0) \to \pi_1(X\cup_f D^n, x_0)$ is \emph{surjective}, and the kernel is $\bra\bra [f] \ket \ket$. Note that this statement makes sense, since $S^{n - 1}$ is a circle, and $f: S^{n - 1} \to X$ is a loop in $X$. + If $n = 2$, then the natural map $\pi_1(X, x_0) \to \pi_1(X\cup_f D^n, x_0)$ is \emph{surjective}, and the kernel is $\bra\bra [f] \ket \ket$. Note that this statement makes sense, since $S^{n - 1}$ is a circle, and $f: S^{n - 1} \to X$ is a loop in $X$. \end{thm} This is what we would expect, since if we attach a disk onto the loop given by $f$, this loop just dies. @@ -2856,7 +2856,7 @@ \subsection{Simplicial complexes} The boundary of $\sigma$ is usually denoted by $\partial \sigma$, while the interior is denoted by $\mathring{\sigma}$, and we write $\tau \leq \sigma$ when $\tau$ is a face of $\sigma$. \end{defi} -In particular, the interior of a vertex is the vertex itself. Note that this notions of interior and boundary are distinct from the topological notions of interior and boundary. +In particular, the interior of a vertex is the vertex itself. 
Note that these notions of interior and boundary are distinct from the topological notions of interior and boundary. \begin{eg} The \emph{standard $n$-simplex} is spanned by the basis vectors $\{\mathbf{e}_0, \cdots, \mathbf{e}_n\}$ in $\R^{n + 1}$. For example, when $n = 2$, we get the following: @@ -3177,7 +3177,7 @@ \subsection{Simplicial approximation} \end{defi} \begin{prop} - $|K| = |K|'$ and $K'$ really is a simplicial complex. + $|K| = |K'|$ and $K'$ really is a simplicial complex. \end{prop} \begin{proof} @@ -3186,12 +3186,12 @@ \subsection{Simplicial approximation} We now have a slight problem. Even though $|K'|$ and $|K|$ are equal, the identity map from $|K'|$ to $|K|$ is not a simplicial map. -To solve this problem, we can choose any function $K \to V_K$ by $\sigma \mapsto v_\sigma$ with $v_\sigma \in \sigma$, i.e.\ a function that sends any simplex to any of its vertices. Then we can define $g: K' \to K$ by sending $\hat{\sigma} \mapsto v_\sigma$. Then this is a simplicial map, and indeed a simplicial approximation to the identity map $|K'| \to |K|$. +To solve this problem, we can choose any function $K \to V_K$ by $\sigma \mapsto v_\sigma$ with $v_\sigma \in \sigma$, i.e.\ a function that sends any simplex to any of its vertices. Then we can define $g: K' \to K$ by sending $\hat{\sigma} \mapsto v_\sigma$. Then this is a simplicial map, and indeed a simplicial approximation to the identity map $|K'| \to |K|$. We will revisit this idea later when we discuss homotopy invariance. The key theorem is that as long as we are willing to perform barycentric subdivisions, then we can always find a simplicial approximation. \begin{thm}[Simplicial approximation theorem] - Le $K$ and $L$ be simplicial complexes, and $f: |K| \to |L|$ a continuous map. Then there exists an $r$ and a simplicial map $g: K^{(r)} \to L$ such that $g$ is a simplicial approximation of $f$. Furthermore, if $f$ is already simplicial on $M\subseteq K$, then we can choose $g$ such that $|g||_M = f|_M$. + Let $K$ and $L$ be simplicial complexes, and $f: |K| \to |L|$ a continuous map. Then there exists an $r$ and a simplicial map $g: K^{(r)} \to L$ such that $g$ is a simplicial approximation of $f$. Furthermore, if $f$ is already simplicial on $M\subseteq K$, then we can choose $g$ such that $|g||_M = f|_M$. \end{thm} The first thing we have to figure out is how far we are going to subdivide. To do this, we want to quantify how ``fine'' our subdivisions are. @@ -3204,9 +3204,9 @@ \subsection{Simplicial approximation} We have the following lemma that tells us how large our mesh is: \begin{lemma} - Let $\dim K \leq n$, then + Let $\dim K = n$, then \[ - \mu(K^{(r)}) = \left(\frac{n}{n + 1}\right)^r \mu(K). + \mu(K^{(r)}) \leq \left(\frac{n}{n + 1}\right)^r \mu(K). \] \end{lemma} % insert proof The key point is that as $r \to \infty$, the mesh goes to zero. So indeed we can make our barycentric subdivisions finer and finer. The proof is purely technical and omitted. @@ -3647,7 +3647,7 @@ \subsection{Some homological algebra} & & D_n & D_{n + 1} \ar [l, "d_{n + 1}"] \end{tikzcd} \] -The intuition behind this definition is as follows: suppose $C_{\Cdot} = C_{\Cdot} (K)$ and $D_{\Cdot} = C_{\Cdot}(L)$ for $K, L$ simplicial complexes, and $f_{\Cdot}$ and $g_{\Cdot}$ are ``induced'' by simplicial maps $f, g: K \to L$. How can we detect if $f$ and $g$ are homotopic via the homotopy groups? 
+The intuition behind this definition is as follows: suppose $C_{\Cdot} = C_{\Cdot} (K)$ and $D_{\Cdot} = C_{\Cdot}(L)$ for $K, L$ simplicial complexes, and $f_{\Cdot}$ and $g_{\Cdot}$ are ``induced'' by simplicial maps $f, g: K \to L$, where by ``induced'' we mean where $n$-simplices are mapped to $k$-simplices for $k 0$ such that if $f, g: |K| \to |L|$ satisfy $\|f(x) - g(x)\| < \varepsilon$, then $f_* = g_*: H_n(K) \to H_n(L)$ for all $n$. \end{lemma} -The idea of the proof is that if $\|f(x) - g(x)\|$ is small enough, we can barycentrically subdivide $L$ such that we get a simplicial approximation to both $f$ and $g$. +The idea of the proof is that if $\|f(x) - g(x)\|$ is small enough, we can barycentrically subdivide $K$ such that we get a simplicial approximation to both $f$ and $g$. \begin{proof} By the Lebesgue number lemma, there is an $\varepsilon > 0$ such that each ball of radius $2\varepsilon$ in $|L|$ lies in some star $\St_L(w)$. @@ -4326,9 +4330,9 @@ \subsection{Continuous maps and homotopy invariance} \[ g(B_\delta(x)) \subseteq B_{2\varepsilon}(y) \subseteq \St_L(w). \] - Now subdivide $r$ times so that $\mu(K^{(r)}) < \frac{1}{2} \delta$. So for all $v \in V_K(R)$, we know + Now subdivide $r$ times so that $\mu(K^{(r)}) < \frac{1}{2} \delta$. So for all $v \in V_{K^(r)}$, we know \[ - \St_{K(r)} (v) \subseteq B_\delta(V). + \St_{K^{(r)}} (v) \subseteq B_\delta(v). \] This gets mapped by \emph{both} $f$ and $g$ to $\St_L(w)$ for the same $w \in V_L$. We define $s: V_{K^{(r)}} \to V_L$ sending $v \mapsto w$. \end{proof} @@ -4443,7 +4447,7 @@ \subsection{Homology of spheres and applications} \node at (-1.6, -1.2) [anchor = north east] {$g(x)$}; \end{tikzpicture} \end{center} - So we now show no such continuous retraction can exist. Suppose $r: D^n \to \partial D^n$ is a retraction, i.e.\ $r \circ i \simeq \id: \partial D^n \to D^n$. + So we now show no such continuous retraction can exist. Suppose $r: D^n \to \partial D^n$ is a retraction, i.e.\ $r \circ i \simeq \id: \partial D^n \to \partial D^n$. \[ \begin{tikzcd} S^{n - 1} \ar[r, "i"] & D^n \ar[r, "r"] & S^{n - 1} @@ -4772,7 +4776,19 @@ \subsection{Rational homology, Euler and Lefschetz numbers} \] In this case, we have not lost any information because there was no torsion part of the homology groups. - However, for the non-orientable surfaces, we have + However, for the non-orientable surfaces, since + + \[ + H_k(E_n) = + \begin{cases} + \Z & k = 0\\ + \Z^{n - 1} \times \Z / 2 & k = 1\\ + 0 & \text{otherwise} + \end{cases}, + \] + + (exercise) we have that + \[ H_k(E_n, \Q) = \begin{cases} @@ -4781,6 +4797,7 @@ \subsection{Rational homology, Euler and Lefschetz numbers} 0 & \text{otherwise} \end{cases}, \] + This time, this is different from the integral coefficient case, where we have an extra $\Z_2$ term in $H_1$. \end{eg} @@ -4866,13 +4883,13 @@ \subsection{Rational homology, Euler and Lefschetz numbers} There is an exact sequence \[ \begin{tikzcd} - 0 \ar[r] & B_i(K; Q) \ar[r] & Z_i(K; \Q) \ar[r] & H_i(K; \Q) \ar[r] & 0 + 0 \ar[r] & B_i(K; \Q) \ar[r] & Z_i(K; \Q) \ar[r] & H_i(K; \Q) \ar[r] & 0 \end{tikzcd} \] This is since $H_i(K, \Q)$ is defined as the quotient of $Z_i$ over $B_i$. We also have the exact sequence \[ \begin{tikzcd} - 0 \ar[r] & Z_i(K; Q) \ar[r] & C_i(K; \Q) \ar[r, "d_i"] & B_{i - 1}(K; \Q) \ar[r] & 0 + 0 \ar[r] & Z_i(K; \Q) \ar[r] & C_i(K; \Q) \ar[r, "d_i"] & B_{i - 1}(K; \Q) \ar[r] & 0 \end{tikzcd} \] This is true by definition of $B_{i - 1}$ and $Z_i$. 
Let $f_i^H, f_i^B, f_i^Z, f_i^C$ be the various maps induced by $f$ on the corresponding groups. Then we have @@ -4937,7 +4954,7 @@ \subsection{Rational homology, Euler and Lefschetz numbers} \end{eg} \begin{eg} - Suppose $G$ is a path-connected topological group, i.e.\ $X$ is a group and a topological space, and inverse and multiplication are continuous maps. + Suppose $G$ is a path-connected topological group, i.e.\ $G$ is a group and a topological space, and inverse and multiplication are continuous maps. If $g \not= 1$, then the map \begin{align*} From b828577f38cca29084081e715c6bf11f638d6779 Mon Sep 17 00:00:00 2001 From: ArthurConmy Date: Sun, 24 Oct 2021 18:09:20 +0100 Subject: [PATCH 14/14] fix a typo and a diagram --- II_M/probability_and_measure.tex | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/II_M/probability_and_measure.tex b/II_M/probability_and_measure.tex index aefdd88..a9dcdc9 100644 --- a/II_M/probability_and_measure.tex +++ b/II_M/probability_and_measure.tex @@ -703,7 +703,7 @@ \subsection{Probability measures} \liminf A_n &= \bigcup_n \bigcap_{m \geq n} A_m. \end{align*} \end{defi} -To parse these definitions more easily, we can read $\cap$ as ``for all'', and $\cup$ as ``there exits''. For example, we can write +To parse these definitions more easily, we can read $\cap$ as ``for all'', and $\cup$ as ``there exist''. For example, we can write \begin{align*} \limsup A_n &= \forall n,\exists m \geq n\text{ such that }A_m\text{ occurs}\\ &= \{x: \forall n, \exists m \geq n, x \in A_m\}\\ @@ -1030,9 +1030,9 @@ \subsection{Constructing new measures} \draw [->] (0, 0) -- (4, 0); \draw [->] (0, 0) -- (0, 4); - \draw [thick, mblue] (0, 0) -- (1.5, 2) -- (2.5, 2) node [draw, fill=white, circle, inner sep = 0, minimum size = 3] {}; + \draw [thick, mblue] (0, 0) -- (1.5, 2) -- (2.5, 2) node [circ] {}; - \draw [thick, mblue] (2.5, 3) node [circ] {} -- (4, 4); + \draw [thick, mblue] (2.5, 3) node [draw, fill=white, circle, inner sep = 0, minimum size = 3] {} -- (4, 4); \end{tikzpicture} \end{center} \end{eg}
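Finally, a tiny truncated sketch of the ``$\cap$ as for all, $\cup$ as there exists'' reading of $\limsup$ given above (the family $A_n$ and the cut-off $N$ below are illustrative; the real definitions range over all $n$):

from functools import reduce

N = 8
A = [set(range(0, 20, n + 1)) for n in range(N)]    # e.g. A_n = multiples of n + 1 below 20

# "intersection over n of the union over m >= n"  versus  "for all n, there exists m >= n"
limsup_sets = reduce(set.intersection, (reduce(set.union, A[n:]) for n in range(N)))
limsup_quantifiers = {x for x in range(20)
                      if all(any(x in A[m] for m in range(n, N)) for n in range(N))}
print(limsup_sets == limsup_quantifiers)            # True: the two readings agree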