From 3eb0f229fdfb303f781ea3fd1b42f9d55ecbb24b Mon Sep 17 00:00:00 2001
From: eneller
Date: Thu, 30 Oct 2025 13:20:34 +0100
Subject: [PATCH] bib

---
 .gitignore  |  1 +
 entropy.bib | 10 +++----
 entropy.tex | 77 +++++++++++++++++++++++++++++++----------------------
 3 files changed, 51 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 5ec2b2b..0f3f690 100644
--- a/.gitignore
+++ b/.gitignore
@@ -332,3 +332,4 @@ TSWLatexianTemp*
 
 # End of https://www.toptal.com/developers/gitignore/api/latex,visualstudiocode
 bib
+*.*-SAVE-ERROR
diff --git a/entropy.bib b/entropy.bib
index e73b5b9..b0dd565 100644
--- a/entropy.bib
+++ b/entropy.bib
@@ -2,34 +2,34 @@
     author = "{Wikipedia contributors}",
     title = "Shannon–Hartley theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633}",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%E2%80%93Hartley_theorem&oldid=1316080633",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:noisy-channel,
     author = "{Wikipedia contributors}",
     title = "Noisy-channel coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870}",
+    url = "https://en.wikipedia.org/w/index.php?title=Noisy-channel_coding_theorem&oldid=1285893870",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ enwiki:source-coding,
     author = "{Wikipedia contributors}",
     title = "Shannon's source coding theorem --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440}",
+    url = "https://en.wikipedia.org/w/index.php?title=Shannon%27s_source_coding_theorem&oldid=1301398440",
     note = "[Online; accessed 29-October-2025]"
 }
 @misc{ dewiki:nyquist-shannon,
     author = "Wikipedia",
     title = "Nyquist-Shannon-Abtasttheorem --- Wikipedia{,} die freie Enzyklopädie",
     year = "2025",
-    url = "\url{https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066}",
+    url = "https://de.wikipedia.org/w/index.php?title=Nyquist-Shannon-Abtasttheorem&oldid=255540066",
     note = "[Online; Stand 29. Oktober 2025]"
 }
 @misc{ enwiki:information-content,
     author = "{Wikipedia contributors}",
     title = "Information content --- {Wikipedia}{,} The Free Encyclopedia",
     year = "2025",
-    howpublished = "\url{https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600}",
+    url = "https://en.wikipedia.org/w/index.php?title=Information_content&oldid=1313862600",
     note = "[Online; accessed 29-October-2025]"
 }
diff --git a/entropy.tex b/entropy.tex
index 5908164..e990fff 100644
--- a/entropy.tex
+++ b/entropy.tex
@@ -5,6 +5,7 @@
 \usepackage{subcaption}
 \usepackage{parskip} % dont indent after paragraphs, figures
 \usepackage{xcolor}
+%\usepackage{csquotes} % Recommended for biblatex
 \usepackage{tikz}
 \usepackage{float}
 \usepackage{amsmath}
@@ -58,9 +59,9 @@ As a measure, Shannon's formula uses the \textit{Bit}, quantifying the efficienc
 and media for transmission and storage.
 According to his axioms, a measure for information has to comply with the following criteria:
 \begin{enumerate}
+    \item $I(1) = 0$: events that always occur do not communicate information.
     \item $I(p)$ is monotonically decreasing in p: an increase in the probability of an event decreases the information from an observed event, and vice versa.
-    \item $I(1) = 0$: events that always occur do not communicate information.
     \item $I(p_1 \cdot p_2) = I(p_1) + I(p_2)$: the information learned from independent events is the sum of the information learned from each event.
     \item $I(p)$ is a twice continuously differentiable function of p.
@@ -69,7 +70,7 @@ In information theory, entropy can be understood as the expected information of
 \begin{equation}
     H = E(I) = - \sum_i p_i \log_2(p_i)
 \end{equation}
-This leaves $ I =log(1/p) = - log_2(p_i)$, implying that an unexpected message (low probability) carries
+This leaves $I = \log_2(1/p_i) = -\log_2(p_i)$, implying that an unexpected message (low probability) carries
 more information than one with higher probability.
 Intuitively, we can imagine David A. Johnston, a volcanologist reporting day after day that there is no activity on Mount St. Helens.
 After a while, we grow to expect this message because it is statistically very likely
@@ -92,7 +93,7 @@ leaving the required Bits per event source at $\log_2(a^k) \div k = \log_2(a)$.
 %- Quellentropie
 \section{Applications}
 \subsection{Decision Trees}
-A decision tree is a supervised learning approach commonly used in data mining.
+A decision tree is a supervised learning approach commonly used in machine learning.
 The goal is to create an algorithm, i.e a series of questions to pose to new data (input variables)
 in order to predict the target variable, a class label.
 Graphically, each question can be visualized as a node in a tree, splitting the dataset into two or more groups.
@@ -100,15 +101,11 @@ This process is applied to the source set and then its resulting sets in a proce
 Once a leaf is reached, the class of the input has been successfully determined.
 In order to build the shallowest possible trees, we want to use input variables that minimize uncertainty.
-While other measures for the best choice such as the Gini coefficient exist,
+While other measures for the best choice such as the \textit{Gini coefficient} exist,
 entropy is a popular measure used in decision trees.
-Using the previous learnings about information and entropy, we of course want to ask questions that have the
-highest information content, so pertaining input variables with the highest entropy.
-
-Zu berechnen ist also die Entropie des Klassifikators abzüglich der erwarteten Entropie
-nach der Aufteilung anhand des Attributs,
-erklärt in~\autoref{ex:decisiontree}.
+Using what we learned about entropy, we want the split that yields the maximum decrease in entropy of our target variable,
+as explained in~\autoref{ex:decisiontree}.
 \begin{figure}[H]
 \centering
 \begin{minipage}{.3\textwidth}
 \end{tabular}
 \end{minipage}
 \begin{minipage}{.6\textwidth}
-Die Entropie für Regen ist also $H_{prior} = H(\frac{9}{14},\frac{5}{14})$,
-nach der Aufteilung anhand des Attributs Temperatur beträgt sie $H_{warm}= H(\frac{4}{7}, \frac{3}{7})$
-und $H_{kalt}= H(\frac{5}{7}, \frac{2}{7})$.
-Also ist die erwartete (Erwartungswert) Entropie nach der Aufteilung anhand der Temperatur $p_{warm} * H_{warm} + p_{kalt} * H_{kalt} $ .
-Der \textbf{Informationsgewinn} berechnet sich dann als Entropie für Regen abzüglich des Erwartungswertes der Entropie.
-Da $H_{prior}$ in dieser Berechnung jedoch konstant ist, kann auch einfach $E[H]$ nach der Aufteilung minimiert werden.
+When choosing rain as the target variable, the entropy prior to partitioning is $H_{prior} = H(\frac{9}{14},\frac{5}{14})$;
+after partitioning by temperature (hot/cold), $H_{hot} = H(\frac{4}{7}, \frac{3}{7})$
+and $H_{cold} = H(\frac{5}{7}, \frac{2}{7})$ remain.
+This leaves us with an expected entropy after the split of
+$p_{hot} \cdot H_{hot} + p_{cold} \cdot H_{cold}$.
+The \textbf{information gain} can then be calculated as the difference between the entropy prior to and after partitioning.
+With seven hot and seven cold days this gives $H_{prior} \approx 0.94$ and an expected entropy of about $0.92$,
+i.e.\ an information gain of roughly $0.02$ bits.
+Since $H_{prior}$ is constant in this calculation, it is sufficient to minimize the post-partitioning expected entropy $E[H]$.
 \end{minipage}
-\caption{Beispiel Informationsgewinn für Aufbau eines Entscheidungsbaums}
+\caption{Example of information gain in decision trees}
 \label{ex:decisiontree}
 \end{figure}
+Advantages of decision trees over other machine learning approaches include low computation cost and
+interpretability, making them a popular choice for many applications.
+However, drawbacks include overfitting and poor robustness, where minimal alterations to the training data
+can lead to a change in the tree structure.
-Vorteil: Niedriger Rechenaufwand und nachvollziehbarer Aufbau
-Nachteil: Overfitting, geringe Robustheit: bereits kleine Änderungen der Trainingsdaten können zu einer Veränderung des Baums führen.
-
-
-Kann zu einem \textbf{Random Forest} (bagging) erweitert werden um die Robustheit zu steigern und Overfitting zu verringern.
-Resultiert jedoch in höherem Rechenaufwand und geringerer Interpretierbarkeit.
-Die Trainingsdaten können dafür zufällig gruppiert werden und die Bäume für eine Mehrheitsentscheidung genutzt werden.
 \subsection{Cross-Entropy}
 Kullback-Leibler = $H(p,q) - H(p)$
 as a cost function in machine learning
@@ -151,8 +146,24 @@ as a cost function in machine learning
 \subsection{Noisy communication channels}
 The noisy channel coding theorem was stated by \textit{Claude Shannon} in 1948,
 but first rigorous proof was provided in 1954 by Amiel Feinstein.
-It is of foundational to information theory, stating that given a noisy channel with capacity $C$
-and information transmitted at $R$ \cite{enwiki:shannon-hartley}
+One of the important issues Shannon wanted to tackle in \textit{A Mathematical Theory of Communication}
+was the lack of means for transmitting discrete data through a noisy channel that were more efficient than
+the telegraph.
+Until then, error correction had been limited to very basic techniques.
+
+Analogue connections such as the first telephone lines bypassed the issue altogether and relied
+on the communicating parties' ability to filter human voices from the noise that was inevitably transmitted
+along with the intended signal.
+After some development, the telegraph in its final form used Morse code, a series of long and short clicks that,
+together with letter and word gaps, encode text messages.
+Even though the long-short coding might appear similar to today's binary coding, the means of error correction were lacking.
+For a long time, error correction relied on simply repeating the message multiple times, which is highly inefficient.
+The destination would then have to determine the most likely intended message by performing a majority vote.
+One might also propose simply increasing the transmission power, thereby decreasing the error rate of the associated channel.
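+% Added worked example (editor's sketch): the three-fold repetition and the value $p = 0.1$ used below are
+% illustrative assumptions rather than figures from the original text.
+To put the cost of simple repetition into numbers: if every bit is sent three times over a channel that
+flips each bit independently with probability $p$, majority decoding fails only when at least two of the
+three copies are flipped,
+\begin{equation}
+    P_{\text{err}} = 3p^2(1-p) + p^3 \approx 0.028 \quad \text{for } p = 0.1,
+\end{equation}
+while the usable transmission rate drops to one third.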
+However, the noisy channel coding theorem provides us with a more elegant solution.
+It is of foundational importance to information theory, stating that given a noisy channel with capacity $C$
+and information transmitted at rate $R$, there exists an encoding scheme that allows the probability of error
+at the receiver to be made arbitrarily small, provided that $R < C$.
     \draw[->] (S) -- (S0);
     \draw[->] (S) -- (S1);
-    \draw[->,dashed] (S0) -- (D0) node[midway, above] {$p$};
-    \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$1-p$};
-    \draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$1-p$};
-    \draw[->,dashed] (S1) -- (D1) node[midway, below] {$p$};
+    \draw[->,dashed] (S0) -- (D0) node[midway, above] {$1-p$};
+    \draw[->,dashed] (S0) -- (D1) node[pos=0.8, above] {$p$};
+    \draw[->,dashed] (S1) -- (D0) node[pos= 0.2, above] {$p$};
+    \draw[->,dashed] (S1) -- (D1) node[midway, below] {$1-p$};
     \draw[->] (D0) -- (D);
     \draw[->] (D1) -- (D);
@@ -205,4 +216,6 @@
 \label{fig:binary-channel}
 \end{figure}
 
+\printbibliography
+
 \end{document}
\ No newline at end of file
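A possible complement to the binary-channel figure, offered here as an editor's sketch rather than as part of
the patch (the numeric value assumes a crossover probability of $p = 0.1$): the capacity of the binary symmetric
channel shown in \autoref{fig:binary-channel} can be written with the entropy defined earlier as

\begin{equation}
    C = 1 - H(p, 1-p) = 1 + p \log_2 p + (1-p) \log_2 (1-p),
\end{equation}

which evaluates to roughly $0.53$ bits per channel use for $p = 0.1$. The coding theorem then promises arbitrarily
reliable transmission at any rate below this value, whereas a simple repetition scheme is stuck at a rate equal to
the reciprocal of the number of repetitions.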