% Step-by-step_intro_to_ML_with_SVC_and_Iris.tex

\documentclass [oneside,10pt,a4paper,ngerman,BCOR10mm,headsepline,parindent,final]{scrartcl}

    \usepackage[breakable]{tcolorbox}
    \usepackage{parskip} % Stop auto-indenting (to mimic markdown behaviour)
    

    % Basic figure setup, for now with no caption control since it's done
    % automatically by Pandoc (which extracts ![](path) syntax from Markdown).
    \usepackage{graphicx}
    % Maintain compatibility with old templates. Remove in nbconvert 6.0
    % \let\Oldincludegraphics\includegraphics
    % Ensure that by default, figures have no caption (until we provide a
    % proper Figure object with a Caption API and a way to capture that
    % in the conversion process - todo).
    % \usepackage{caption}
    % \DeclareCaptionFormat{nocaption}{}
    % \captionsetup{format=nocaption,aboveskip=0pt,belowskip=0pt}

    \usepackage{float}
    \floatplacement{figure}{H} % forces figures to be placed at the correct location
    \usepackage{xcolor} % Allow colors to be defined
    \usepackage{enumerate} % Needed for markdown enumerations to work
    \usepackage{geometry} % Used to adjust the document margins
    \usepackage{amsmath} % Equations
    \usepackage{amssymb} % Equations
    \usepackage{textcomp} % defines textquotesingle
    % Hack from http://tex.stackexchange.com/a/47451/13684:
    % The definition is deferred to \begin{document}, so it is executed after
    % everything in the preamble. It therefore replaces the preamble
    % definition \def\PYZsq{\char`\'} made in the Pygments section below.
    \AtBeginDocument{%
        \def\PYZsq{\textquotesingle}% Upright quotes in Pygmentized code
    }
    \usepackage{upquote} % Upright quotes for verbatim code
    \usepackage{eurosym} % defines \euro

    \usepackage{iftex}
    \ifPDFTeX
        \usepackage[utf8]{inputenc}
        \usepackage[T1]{fontenc}
        % Without the 'lmodern' package, 'pdflatex' substitutes the Type 1 fonts 
        % against the bitmap based Type 3 fonts and you get a very pixelated typeface.
        \usepackage{lmodern}
    \else
        \usepackage{fontspec}
        \usepackage{unicode-math}
    \fi

    \usepackage{fancyvrb} % verbatim replacement that allows latex
    \usepackage{grffile} % extends the file name processing of package graphics 
                         % to support a larger range
    \makeatletter % fix for old versions of grffile with XeLaTeX
    % \@ifpackagelater selects the first branch when the loaded grffile is
    % dated 2019/11/01 or later, and the second branch otherwise.
    \@ifpackagelater{grffile}{2019/11/01}
    {
      % Do nothing on new versions
    }
    {
      % Older grffile: redefine the internal \Gread@@xetex. If a file named
      % "<base>".bb exists it is read through \Gread@eps; otherwise the call
      % is forwarded to \Gread@@xetex@aux with the original argument.
      \def\Gread@@xetex#1{%
        \IfFileExists{"\Gin@base".bb}%
        {\Gread@eps{\Gin@base.bb}}%
        {\Gread@@xetex@aux#1}%
      }
    }
    \makeatother
    \usepackage[Export]{adjustbox} % Used to constrain images to a maximum size
    \adjustboxset{max size={0.9\linewidth}{0.9\paperheight}}

    % The hyperref package gives us a pdf with properly built
    % internal navigation ('pdf bookmarks' for the table of contents,
    % internal cross-reference links, web links for URLs, etc.)
    \usepackage{hyperref}
    % The default LaTeX title has an obnoxious amount of whitespace. By default,
    % titling removes some of it. It also provides customization options.
    \usepackage{titling}
    \usepackage{longtable} % longtable support required by pandoc >1.10
    \usepackage{booktabs}  % table support for pandoc > 1.12.2
    \usepackage{array}     % table support for pandoc >= 2.11.3
    \usepackage{calc}      % table minipage width calculation for pandoc >= 2.11.1
    \usepackage[inline]{enumitem} % IRkernel/repr support (it uses the enumerate* environment)
    \usepackage[normalem]{ulem} % ulem is needed to support strikethroughs (\sout)
                                % normalem makes italics be italics, not underlines
    \usepackage{mathrsfs}
    
    % Using fancy headers and footers
    \usepackage{fancyhdr}
    
    % Used for entering author names and their affiliations
    \usepackage[affil-it]{authblk}
    
    % Bibliography setup: csquotes provides the quotation handling biblatex
    % expects; biblatex is used with the biber backend, author-year style
    % and back references.
    \usepackage[babel,german=quotes]{csquotes}
    \usepackage[backend=biber,style=authoryear,backref=true]{biblatex}
    % \addbibresource is biblatex's current interface for registering a
    % database (\bibliography is only kept for backwards compatibility).
    % It expects the complete file name, including the .bib extension.
    \addbibresource{literature/notebook.bib}
    \usepackage{url}                    %Output of nicely formatted Internet addresses
    \setcounter{biburllcpenalty}{7000}  %Setting for counter to wrap URLs in literature references
    \setcounter{biburlucpenalty}{8000}  %ditto


    % --- Link colours (referenced by name in \hypersetup) -----------------
    \definecolor{urlcolor}{rgb}{0,0.145,0.698}
    \definecolor{linkcolor}{rgb}{0.71,0.21,0.01}
    \definecolor{citecolor}{rgb}{0.12,0.54,0.11}

    % --- ANSI palette, normal intensity -----------------------------------
    \definecolor{ansi-black}{HTML}{3E424D}
    \definecolor{ansi-red}{HTML}{E75C58}
    \definecolor{ansi-green}{HTML}{00A250}
    \definecolor{ansi-yellow}{HTML}{DDB62B}
    \definecolor{ansi-blue}{HTML}{208FFB}
    \definecolor{ansi-magenta}{HTML}{D160C4}
    \definecolor{ansi-cyan}{HTML}{60C6C8}
    \definecolor{ansi-white}{HTML}{C5C1B4}

    % --- ANSI palette, intense variants -----------------------------------
    \definecolor{ansi-black-intense}{HTML}{282C36}
    \definecolor{ansi-red-intense}{HTML}{B22B31}
    \definecolor{ansi-green-intense}{HTML}{007427}
    \definecolor{ansi-yellow-intense}{HTML}{B27D12}
    \definecolor{ansi-blue-intense}{HTML}{0065CA}
    \definecolor{ansi-magenta-intense}{HTML}{A03196}
    \definecolor{ansi-cyan-intense}{HTML}{258F8F}
    \definecolor{ansi-white-intense}{HTML}{A1A6B2}

    % --- ANSI defaults for inverse video ----------------------------------
    \definecolor{ansi-default-inverse-fg}{HTML}{FFFFFF}
    \definecolor{ansi-default-inverse-bg}{HTML}{000000}

    % --- Colour shared by all error outputs -------------------------------
    \definecolor{outerrorbackground}{HTML}{FFDFDF}

    % Support macros for snippets produced by pandoc
    % (taken from what `pandoc -s` writes into its own preamble).

    % \tightlist: zero the item and paragraph separation inside a list.
    % \providecommand leaves an already existing definition untouched.
    \providecommand{\tightlist}{%
      \setlength{\itemsep}{0pt}%
      \setlength{\parskip}{0pt}%
    }

    % Highlighting: a fancyvrb Verbatim in which \, { and } keep their
    % command meaning, so the \...Tok macros work inside the listing.
    % Add ',fontsize=\small' for more characters per line
    \DefineVerbatimEnvironment{Highlighting}{Verbatim}{commandchars=\\\{\}}

    % Shaded: wrapper environment around Highlighting, defined empty here.
    \newenvironment{Shaded}{}{}
    % Pandoc syntax-highlighting token macros. Each takes the token text as
    % its only argument. They are grouped by role below; the definitions are
    % independent of one another.

    % Tokens printed without any styling
    \newcommand{\NormalTok}[1]{{#1}}
    \newcommand{\RegionMarkerTok}[1]{{#1}}
    \newcommand{\ImportTok}[1]{{#1}}
    \newcommand{\BuiltInTok}[1]{{#1}}
    \newcommand{\ExtensionTok}[1]{{#1}}

    % Keywords and control flow
    \newcommand{\KeywordTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
    \newcommand{\ControlFlowTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{\textbf{{#1}}}}
    \newcommand{\OtherTok}[1]{\textcolor[rgb]{0.00,0.44,0.13}{{#1}}}

    % Numeric literals
    \newcommand{\DecValTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
    \newcommand{\BaseNTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}
    \newcommand{\FloatTok}[1]{\textcolor[rgb]{0.25,0.63,0.44}{{#1}}}

    % Character and string literals
    \newcommand{\CharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
    \newcommand{\StringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
    \newcommand{\SpecialCharTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
    \newcommand{\VerbatimStringTok}[1]{\textcolor[rgb]{0.25,0.44,0.63}{{#1}}}
    \newcommand{\SpecialStringTok}[1]{\textcolor[rgb]{0.73,0.40,0.53}{{#1}}}

    % Comments, documentation and annotations
    \newcommand{\CommentTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textit{{#1}}}}
    \newcommand{\DocumentationTok}[1]{\textcolor[rgb]{0.73,0.13,0.13}{\textit{{#1}}}}
    \newcommand{\AnnotationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
    \newcommand{\CommentVarTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
    \newcommand{\InformationTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}
    \newcommand{\WarningTok}[1]{\textcolor[rgb]{0.38,0.63,0.69}{\textbf{\textit{{#1}}}}}

    % Alerts and errors
    \newcommand{\AlertTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}
    \newcommand{\ErrorTok}[1]{\textcolor[rgb]{1.00,0.00,0.00}{\textbf{{#1}}}}

    % Types, constants, names, operators and preprocessor tokens
    \newcommand{\DataTypeTok}[1]{\textcolor[rgb]{0.56,0.13,0.00}{{#1}}}
    \newcommand{\ConstantTok}[1]{\textcolor[rgb]{0.53,0.00,0.00}{{#1}}}
    \newcommand{\FunctionTok}[1]{\textcolor[rgb]{0.02,0.16,0.49}{{#1}}}
    \newcommand{\VariableTok}[1]{\textcolor[rgb]{0.10,0.09,0.49}{{#1}}}
    \newcommand{\OperatorTok}[1]{\textcolor[rgb]{0.40,0.40,0.40}{{#1}}}
    \newcommand{\PreprocessorTok}[1]{\textcolor[rgb]{0.74,0.48,0.00}{{#1}}}
    \newcommand{\AttributeTok}[1]{\textcolor[rgb]{0.49,0.56,0.16}{{#1}}}
    
    
    % Define a nice break command that doesn't care if a line doesn't already
    % exist.
    % \hspace*{\fill} puts material on the line first, so the following \\*
    % (line break that forbids a page break after it) always has a line to end.
    \def\br{\hspace*{\fill} \\* }
    % Math Jax compatibility definitions
    % \gt and \lt expand to the plain characters > and <. They are defined
    % with \def, so an existing definition of either name is overwritten
    % without an error.
    \def\gt{>}
    \def\lt{<}
    % Save the original logo macros, then redefine \TeX and \LaTeX so that
    % the logos are always wrapped in \textrm.
    \let\Oldtex\TeX
    \let\Oldlatex\LaTeX
    \renewcommand{\TeX}{\textrm{\Oldtex}}
    \renewcommand{\LaTeX}{\textrm{\Oldlatex}}
    % Document parameters: title, author, affiliation (authblk) and date.
    % Title: set in bold sans serif; the dash before the subtitle is an
    % en dash (--) rather than a hyphen.
    \title{\textbf{\textsf{Getting started with Machine Learning (ML) and Support Vector Classifiers (SVC) -- A systematic step-by-step approach}}}
    % Author: "~" after the academic title keeps it on the same line as the
    % name and avoids end-of-sentence spacing after the period.
    \author{Dipl.-Ing.~Bj\"orn Kasper (\href{mailto:kasper.bjoern@bgetem.de}{kasper.bjoern@bgetem.de})}
    \affil{Test and Certification Body for Electrical Engineering at BG ETEM}
    % Date line: compilation date followed by the document version.
    \date{\today; version 1.31 (release)}


% Pygments definitions
% Core of the highlighting machinery. A highlighted token is written as
% \PY{<type>+<type>+...}{<text>}. Six style hooks are used:
% \PY@it, \PY@bf, \PY@ul, \PY@tc, \PY@bc and \PY@ff.
\makeatletter
% \PY@reset: set all six style hooks to \relax (i.e. no styling).
\def\PY@reset{\let\PY@it=\relax \let\PY@bf=\relax%
    \let\PY@ul=\relax \let\PY@tc=\relax%
    \let\PY@bc=\relax \let\PY@ff=\relax}
% \PY@tok{<type>}: execute the macro named PY@tok@<type>.
\def\PY@tok#1{\csname PY@tok@#1\endcsname}
% \PY@toks: walk through a "+"-separated list of token types, calling
% \PY@tok for each entry; the recursion stops at the \relax sentinel that
% \PY appends to the list.
\def\PY@toks#1+{\ifx\relax#1\empty\else%
    \PY@tok{#1}\expandafter\PY@toks\fi}
% \PY@do{<text>}: apply the hooks to the text, nested from the outside in
% as \PY@bc, \PY@tc, \PY@ul, \PY@it, \PY@bf, \PY@ff.
\def\PY@do#1{\PY@bc{\PY@tc{\PY@ul{%
    \PY@it{\PY@bf{\PY@ff{#1}}}}}}}
% \PY{<types>}{<text>}: entry point. Resets the hooks, lets every listed
% token type install its settings, then typesets the text.
\def\PY#1#2{\PY@reset\PY@toks#1+\relax+\PY@do{#2}}

% Style table: one macro PY@tok@<type> per Pygments token type. Each one
% assigns some of the style hooks that \PY@do applies afterwards:
%   \PY@tc -> \textcolor[rgb]{...} for the token text,
%   \PY@bf -> \textbf, \PY@it -> \textit,
%   \PY@bc -> a box drawn around the token (only used by PY@tok@err).
% Hooks that an entry does not assign keep the \relax set by \PY@reset.
\@namedef{PY@tok@w}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.73,0.73}{##1}}}
\@namedef{PY@tok@c}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}
\@namedef{PY@tok@cp}{\def\PY@tc##1{\textcolor[rgb]{0.61,0.40,0.00}{##1}}}
\@namedef{PY@tok@k}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@kp}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@kt}{\def\PY@tc##1{\textcolor[rgb]{0.69,0.00,0.25}{##1}}}
\@namedef{PY@tok@o}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@ow}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
\@namedef{PY@tok@nb}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@nf}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\@namedef{PY@tok@nc}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\@namedef{PY@tok@nn}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\@namedef{PY@tok@ne}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.80,0.25,0.22}{##1}}}
\@namedef{PY@tok@nv}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@no}{\def\PY@tc##1{\textcolor[rgb]{0.53,0.00,0.00}{##1}}}
\@namedef{PY@tok@nl}{\def\PY@tc##1{\textcolor[rgb]{0.46,0.46,0.00}{##1}}}
\@namedef{PY@tok@ni}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.44,0.44,0.44}{##1}}}
\@namedef{PY@tok@na}{\def\PY@tc##1{\textcolor[rgb]{0.41,0.47,0.13}{##1}}}
\@namedef{PY@tok@nt}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@nd}{\def\PY@tc##1{\textcolor[rgb]{0.67,0.13,1.00}{##1}}}
\@namedef{PY@tok@s}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@sd}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@si}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.64,0.35,0.47}{##1}}}
\@namedef{PY@tok@se}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.67,0.36,0.12}{##1}}}
\@namedef{PY@tok@sr}{\def\PY@tc##1{\textcolor[rgb]{0.64,0.35,0.47}{##1}}}
\@namedef{PY@tok@ss}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@sx}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@m}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@gh}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
\@namedef{PY@tok@gu}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.50,0.00,0.50}{##1}}}
\@namedef{PY@tok@gd}{\def\PY@tc##1{\textcolor[rgb]{0.63,0.00,0.00}{##1}}}
\@namedef{PY@tok@gi}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.52,0.00}{##1}}}
\@namedef{PY@tok@gr}{\def\PY@tc##1{\textcolor[rgb]{0.89,0.00,0.00}{##1}}}
\@namedef{PY@tok@ge}{\let\PY@it=\textit}
\@namedef{PY@tok@gs}{\let\PY@bf=\textbf}
\@namedef{PY@tok@gp}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,0.50}{##1}}}
\@namedef{PY@tok@go}{\def\PY@tc##1{\textcolor[rgb]{0.44,0.44,0.44}{##1}}}
\@namedef{PY@tok@gt}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.27,0.87}{##1}}}
\@namedef{PY@tok@err}{\def\PY@bc##1{{\setlength{\fboxsep}{\string -\fboxrule}\fcolorbox[rgb]{1.00,0.00,0.00}{1,1,1}{\strut ##1}}}}
\@namedef{PY@tok@kc}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@kd}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@kn}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@kr}{\let\PY@bf=\textbf\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@bp}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.50,0.00}{##1}}}
\@namedef{PY@tok@fm}{\def\PY@tc##1{\textcolor[rgb]{0.00,0.00,1.00}{##1}}}
\@namedef{PY@tok@vc}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@vg}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@vi}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@vm}{\def\PY@tc##1{\textcolor[rgb]{0.10,0.09,0.49}{##1}}}
\@namedef{PY@tok@sa}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@sb}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@sc}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@dl}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@s2}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@sh}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@s1}{\def\PY@tc##1{\textcolor[rgb]{0.73,0.13,0.13}{##1}}}
\@namedef{PY@tok@mb}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@mf}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@mh}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@mi}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@il}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@mo}{\def\PY@tc##1{\textcolor[rgb]{0.40,0.40,0.40}{##1}}}
\@namedef{PY@tok@ch}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}
\@namedef{PY@tok@cm}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}
\@namedef{PY@tok@cpf}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}
\@namedef{PY@tok@c1}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}
\@namedef{PY@tok@cs}{\let\PY@it=\textit\def\PY@tc##1{\textcolor[rgb]{0.24,0.48,0.48}{##1}}}

% Escape macros: each \PYZ.. macro prints one character that cannot be
% written literally in highlighted code, using \char with the character's
% code (backslash, underscore, braces, ^, &, <, >, #, %, $, -, ', ", ~).
% Note that \PYZsq is redefined at \begin{document} by the \AtBeginDocument
% hook near the top of the preamble, which switches it to \textquotesingle.
\def\PYZbs{\char`\\}
\def\PYZus{\char`\_}
\def\PYZob{\char`\{}
\def\PYZcb{\char`\}}
\def\PYZca{\char`\^}
\def\PYZam{\char`\&}
\def\PYZlt{\char`\<}
\def\PYZgt{\char`\>}
\def\PYZsh{\char`\#}
\def\PYZpc{\char`\%}
\def\PYZdl{\char`\$}
\def\PYZhy{\char`\-}
\def\PYZsq{\char`\'}
\def\PYZdq{\char`\"}
\def\PYZti{\char`\~}
% for compatibility with earlier versions
% (these three expand to the literal characters @, [ and ])
\def\PYZat{@}
\def\PYZlb{[}
\def\PYZrb{]}
\makeatother


    % For linebreaks inside Verbatim environment from package fancyvrb.
    \makeatletter
        % Boxes that hold the continuation symbol and the visible space; they
        % are filled by the redefined \Verbatim each time it starts.
        \newbox\Wrappedcontinuationbox
        \newbox\Wrappedvisiblespacebox
        % Symbol shown at the end of a line that was broken at a space.
        \newcommand*\Wrappedvisiblespace {\textcolor{red}{\textvisiblespace}}
        % Symbol shown at the start of a continuation line.
        \newcommand*\Wrappedcontinuationsymbol {\textcolor{red}{\llap{\tiny$\m@th\hookrightarrow$}}}
        % Indentation of continuation lines.
        \newcommand*\Wrappedcontinuationindent {3ex }
        % Material inserted after every break: indent plus continuation symbol.
        \newcommand*\Wrappedafterbreak {\kern\Wrappedcontinuationindent\copy\Wrappedcontinuationbox}
        % Take advantage of the already applied Pygments mark-up to insert
        % potential linebreaks for TeX processing.
        %        {, <, #, %, $, ' and ": go to next line.
        %        _, }, ^, &, >, - and ~: stay at end of broken line.
        % Use of \textquotesingle for straight quote.
        %
        % The macros redefined here must be the escape macros that the
        % Pygments definitions of this file provide, i.e. the \PYZ.. family
        % (\PYZus, \PYZob, ...). Redefining macros with any other prefix
        % would have no effect on the highlighted code.
        \newcommand*\Wrappedbreaksatspecials {%
            \def\PYZus{\discretionary{\char`\_}{\Wrappedafterbreak}{\char`\_}}%
            \def\PYZob{\discretionary{}{\Wrappedafterbreak\char`\{}{\char`\{}}%
            \def\PYZcb{\discretionary{\char`\}}{\Wrappedafterbreak}{\char`\}}}%
            \def\PYZca{\discretionary{\char`\^}{\Wrappedafterbreak}{\char`\^}}%
            \def\PYZam{\discretionary{\char`\&}{\Wrappedafterbreak}{\char`\&}}%
            \def\PYZlt{\discretionary{}{\Wrappedafterbreak\char`\<}{\char`\<}}%
            \def\PYZgt{\discretionary{\char`\>}{\Wrappedafterbreak}{\char`\>}}%
            \def\PYZsh{\discretionary{}{\Wrappedafterbreak\char`\#}{\char`\#}}%
            \def\PYZpc{\discretionary{}{\Wrappedafterbreak\char`\%}{\char`\%}}%
            \def\PYZdl{\discretionary{}{\Wrappedafterbreak\char`\$}{\char`\$}}%
            \def\PYZhy{\discretionary{\char`\-}{\Wrappedafterbreak}{\char`\-}}%
            \def\PYZsq{\discretionary{}{\Wrappedafterbreak\textquotesingle}{\textquotesingle}}%
            \def\PYZdq{\discretionary{}{\Wrappedafterbreak\char`\"}{\char`\"}}%
            \def\PYZti{\discretionary{\char`\~}{\Wrappedafterbreak}{\char`\~}}%
        }
        % Some characters . , ; : ? ! / are not pygmentized.
        % This macro makes them "active" and they will insert potential linebreaks.
        % Technique: \lccode maps ~ to the target character, so that
        % \lowercase{\def~} defines the *active* version of that character.
        % The last line restores the \lccode of ~ itself.
        \newcommand*\Wrappedbreaksatpunct {%
            \lccode`\~`\.\lowercase{\def~}{\discretionary{\hbox{\char`\.}}{\Wrappedafterbreak}{\hbox{\char`\.}}}%
            \lccode`\~`\,\lowercase{\def~}{\discretionary{\hbox{\char`\,}}{\Wrappedafterbreak}{\hbox{\char`\,}}}%
            \lccode`\~`\;\lowercase{\def~}{\discretionary{\hbox{\char`\;}}{\Wrappedafterbreak}{\hbox{\char`\;}}}%
            \lccode`\~`\:\lowercase{\def~}{\discretionary{\hbox{\char`\:}}{\Wrappedafterbreak}{\hbox{\char`\:}}}%
            \lccode`\~`\?\lowercase{\def~}{\discretionary{\hbox{\char`\?}}{\Wrappedafterbreak}{\hbox{\char`\?}}}%
            \lccode`\~`\!\lowercase{\def~}{\discretionary{\hbox{\char`\!}}{\Wrappedafterbreak}{\hbox{\char`\!}}}%
            \lccode`\~`\/\lowercase{\def~}{\discretionary{\hbox{\char`\/}}{\Wrappedafterbreak}{\hbox{\char`\/}}}%
            \catcode`\.\active
            \catcode`\,\active
            \catcode`\;\active
            \catcode`\:\active
            \catcode`\?\active
            \catcode`\!\active
            \catcode`\/\active
            \lccode`\~`\~
        }
    \makeatother

    % Keep fancyvrb's own \Verbatim so that the wrapper below can call it.
    \let\OriginalVerbatim=\Verbatim
    \makeatletter
    % Line-wrapping replacement for \Verbatim.
    %   #1 (optional): fancyvrb key=value options, passed on unchanged.
    % The default is empty. A non-empty default would be forwarded to
    % fancyvrb as if it were an option key whenever the environment is used
    % without options.
    \renewcommand{\Verbatim}[1][]{%
        %\parskip\z@skip
        % Typeset the continuation symbol and the visible space once per
        % environment, in the font that is active for the listing.
        \sbox\Wrappedcontinuationbox {\Wrappedcontinuationsymbol}%
        \sbox\Wrappedvisiblespacebox {\FV@SetupFont\Wrappedvisiblespace}%
        % Each code line becomes a ragged-right \vtop of width \linewidth,
        % with the hyphenation penalties set to zero so that breaking at the
        % discretionaries is not penalised.
        \def\FancyVerbFormatLine ##1{\hsize\linewidth
            \vtop{\raggedright\hyphenpenalty\z@\exhyphenpenalty\z@
                \doublehyphendemerits\z@\finalhyphendemerits\z@
                \strut ##1\strut}%
        }%
        % If the linebreak is at a space, the latter will be displayed as visible
        % space at end of first line, and a continuation symbol starts next line.
        % Stretch/shrink are however usually zero for typewriter font.
        \def\FV@Space {%
            \nobreak\hskip\z@ plus\fontdimen3\font minus\fontdimen4\font
            \discretionary{\copy\Wrappedvisiblespacebox}{\Wrappedafterbreak}
            {\kern\fontdimen2\font}%
        }%
        % End the current paragraph before the listing starts. (The original
        % macro body had an empty line at this point, which TeX reads as \par.)
        \par
        % Install the break-enabled versions of the Pygments escape macros.
        \Wrappedbreaksatspecials
        % Breaks at punctuation characters . , ; : ? ! and / need catcode=\active
        \OriginalVerbatim[#1,codes*=\Wrappedbreaksatpunct]%
    }
    \makeatother

    % Exact colors from NB
    % incolor / outcolor: colours for input and output prompts (going by the
    % names); cellborder / cellbackground: frame and fill colour of a cell.
    \definecolor{incolor}{HTML}{303F9F}
    \definecolor{outcolor}{HTML}{D84315}
    \definecolor{cellborder}{HTML}{CFCFCF}
    \definecolor{cellbackground}{HTML}{F7F7F7}
    
    % prompt
    % \boxspacing: a kern made of two tcolorbox internal lengths, the left
    % rule width followed by the box separation.
    \makeatletter
    \newcommand{\boxspacing}{\kern\kvtcb@left@rule\kern\kvtcb@boxsep}
    \makeatother
    % \prompt{#1}{#2}{#3}{#4}: prints "[#3]:" in colour #2 and typewriter
    % font, followed by 3pt of space and #4, all inside \llap so that it
    % sticks out to the left. The trailing \vspace{-\baselineskip} moves back
    % up by one line. Argument #1 is accepted but not used in the body.
    % NOTE(review): the line ends inside the definition are not masked with
    % "%", so the expansion contains spaces - keep the layout as it is.
    \newcommand{\prompt}[4]{
        {\ttfamily\llap{{\color{#2}[#3]:\hspace{3pt}#4}}\vspace{-\baselineskip}}
    }
    

    % Relax TeX's line-breaking tolerances for the whole document so that
    % material that is hard to break does not stick out into the margin.
    \sloppy

    % hyperref configuration
    \hypersetup{
      % link and bookmark behaviour
      breaklinks        = {true},  % long URLs may be broken across lines
      bookmarksnumbered = {true},
      % PDF document information
      pdfauthor         = {Dipl.-Ing. Bj\"orn Kasper},
      pdftitle          = {Getting started with Machine Learning (ML) and Support Vector Classifiers (SVC) - A systematic step-by-step approach},
      % coloured link text, using the colours defined in this preamble
      colorlinks        = {true},
      urlcolor          = {urlcolor},
      linkcolor         = {linkcolor},
      citecolor         = {citecolor},
      % viewer settings when the PDF is opened
      pdfpagemode       = {UseOutlines},
      pdfview           = {XYZ},
      pdfstartview      = {XYZ},
      pdfstartpage      = {1},
      pdfborder         = {0 0 0}
    }
    % Page margins: one inch on every side
    \geometry{
      verbose,
      tmargin=1in,
      bmargin=1in,
      lmargin=1in,
      rmargin=1in
    }


\begin{document}
    
    % Without changing the numbering style,
    % page numbers and column titles should be turned off.
    \pagestyle{empty}
    
    \maketitle\thispagestyle{empty}\begin{center}
        \includegraphics[width=0.90\textwidth]{images/Cover_image.pdf}
        \end{center}
        \vfill

    \begin{abstract}
    Anyone who wants to seriously deal with the emerging topic of our time, ``Artificial Intelligence (AI)'', cannot avoid dealing with the basic mathematical models and algorithms from the field of ``Machine Learning (ML)'' as a subset of AI. However, anyone who opens the door to this equally exciting, arbitrarily complex and, at first glance, confusing world for the first time will very quickly be overwhelmed. Here, it is a good idea to consult introductory and systematic tutorials. Therefore, this Getting Started tutorial systematically demonstrates the typical ML work process step by step using the very powerful and performant ``Support Vector Classifier (SVC)'' and the widely known and exceptionally beginner-friendly ``Iris Dataset''. Furthermore, the selection of the ``correct'' SVC kernel and its parameters is described, and their effects on the classification result are shown.
    \end{abstract}
    \vfill\noindent
    \begin{center}
	    \begin{tabular}{>{\centering}m{0.2\textwidth}m{0.65\textwidth}}
	    \begin{minipage}{\linewidth}
	        \includegraphics{images/CC_BY-SA_40.png}
	    \end{minipage}
	    &
	    \begin{minipage}{\linewidth}
	        This work is licensed under a \href{https://creativecommons.org/licenses/by-sa/4.0/}{Creative Commons Attribution-ShareAlike 4.0 International License (CC BY-SA 4.0)}.
	    \end{minipage}
	    \end{tabular}
	\end{center}

    \newpage

    % Activate own page style
    \pagestyle{fancy}
    % Delete all fields
    \fancyhf{}
    % \fancyhead[EL,OL]{$header$}
    % \fancyfoot[EL,OL]{$footer$}
    % Header, right-hand side on even (E) and odd (O) pages: the current
    % \leftmark, i.e. the section mark set by \sectionmark below
    \fancyhead[ER,OR]{\leftmark}
    % Footer, right-hand side on even and odd pages: page number
    % ("Seite" is German for "page")
    \fancyfoot[ER,OR]{Seite \thepage}

    % \sectionmark is redefined so that the left mark holds the section
    % number followed by the section title; the right mark is left empty.
    \renewcommand{\sectionmark}[1]{
        \markboth{\thesection{} #1}{}
    }

    
    \tableofcontents
    
    
    \hypertarget{introduction}{%
\section{Introduction}\label{introduction}}

    \hypertarget{english-introduction}{%
\subsection{English introduction}\label{english-introduction}}

\hypertarget{ai-and-ml-in-the-digitalized-working-world}{%
\subsubsection{AI and ML in the digitalized working
world}\label{ai-and-ml-in-the-digitalized-working-world}}

In the \textbf{digitized work environment}, there is an increasing
demand for \textbf{work equipment} to be able to adapt independently and
in a task-related manner to changing work situations. Depending on the
strength of the degree of flexibility, this \textbf{situational
adaptivity} can often only be realized by applying mathematical models
and algorithms from the field of \textbf{Machine Learning (ML)} as a
subset of \textbf{Artificial Intelligence (AI)}.

\hypertarget{automation-and-autonomy}{%
\subsubsection{Automation and autonomy}\label{automation-and-autonomy}}

Examples of such AI applications in work environments can range from
comparatively simple \textbf{voice assistance systems} (similar, for
example, to Siri or Alexa from the private sphere) to partially or
\textbf{highly automated systems}. The transition from
\textbf{automation to autonomy} is currently the subject of much
controversy among experts and can be viewed in terms of the transition
of responsibility from humans to technical systems (\cite{Adler_2021};
\cite{Adler_2019}).

By definition, a system is called \textbf{autonomous} only when it can
achieve a given goal \textbf{independently} and adapted to the situation
\textbf{without human control} or detailed \textbf{programming}
(\cite{EFI_autSysteme_2018}; \cite{acatech_2017}).

However, the distinction between the degree of automation and the
autonomy of a technical system is relatively vague and difficult to
define, depending on the technical context and the degree of
abstraction. Crucial for the classification are the degrees of
\textbf{self-determination}, \textbf{independence} as well as the
\textbf{freedom of decision or action} of a technical system towards
\textbf{human intervention} or preprogrammed behavior patterns
(cf.~\cite{Wiki_Autonomie}).

In contrast to highly automated systems, it is only through the use of AI
algorithms that autonomous systems are able to act autonomously, solve
problems, and learn to constantly improve in the process
(\cite{acatech_2017}).

For example, \textbf{driverless transport systems (automated guided
vehicles, AGVs)} can navigate \textbf{autonomously} through larger
industrial facilities using self-learned, self-updated maps shared with
other AGVs, and avoid
location-changing obstacles by independently finding and optimizing
suitable routes. However, at a higher level of abstraction, new
logistics tasks are given to them by human operators, which is why AGVs
tend to be \textbf{highly automated systems} from a human perspective.

\hypertarget{operating-and-safety-functions}{%
\subsubsection{Operating and safety
functions}\label{operating-and-safety-functions}}

In addition to the many very interesting advantages, e.g.~in terms of
economic efficiency and workload reduction, such highly automated
systems and, depending on the point of view, autonomous subsystems are
characterized by a very high level of technical complexity. This
concerns both their \textbf{operating functions} (e.g.~autonomous
navigation through complex industrial environments with shared use of
the roadways by other human-controlled vehicles) and their
\textbf{safety functions} (e.g.~evaluation of interlinked imaging and
non-imaging safety sensors for monitoring the driving space to avoid
collisions).

\hypertarget{requirements-for-safety-functions}{%
\subsubsection{Requirements for safety
functions}\label{requirements-for-safety-functions}}

Very high requirements are placed on such autonomous systems and the AI
algorithms used for this purpose with regard to \textbf{functional
safety}. However, the requirements for safety evaluability in terms of
\textbf{transparency} (complete understanding of the system) and
\textbf{explainability} of decisions made by AI are currently very
difficult or impossible to achieve, especially when using AI algorithms
from the field of \textbf{deep learning} (\cite{Liggesmeyer_2019}).

Unlike automated systems, the functionality of AI-powered autonomous
systems is not fully programmed out before operational use, but is
created by applying algorithms with learning capabilities to data. This
results in a model that is merely executed by the software at runtime.
Due to its \textbf{inherent complexity}, the resulting model is
generally \textbf{not comprehensible} to humans, which means that the
\textbf{decisions} of an AI system are often \textbf{not transparent}.
Although the requirements for the AI system typically cannot be fully
described, it must still function reliably later at runtime in a very
large application space (\cite{Schneider_2021}). This pushes today's
established methods and techniques of systematic software design and
testing of safety-related software to their limits (cf.~\textbf{V model}
according to \cite{DIN_EN_61508-3_2011-02}).

Furthermore, in terms of their \textbf{recognition rates} and thus the
\textbf{reliability of their decisions}, today's AI algorithms very
often do not meet the functional safety requirements to achieve higher
safety levels, even under the most favorable conditions. For example, a
software-based safety function with a performance level d
(\(\mathrm{PL}_{d}\)) typically required for machines in accordance with
ISO 13849-1 may only fail dangerously with a probability of \(10^{-7}\)
to \(10^{-6}\) per hour during continuous use (see table K.1 in
\cite{DIN_EN_ISO_13849-1_2016}).

Compared to traditional, fully programmed software, the relatively low
robustness of data-driven algorithms from the field of deep learning is
another challenge. As a result, \textbf{small changes} in the
function-determining \textbf{training data} can, under some
circumstances, lead to \textbf{large and unpredictable changes} in
system behavior.
However, the \textbf{predictability} and \textbf{transparency} of the
system behavior are elementary for a \textbf{safety verification}
(\cite{BAuA_Rechtsgutachten_KI_2021}).

\hypertarget{occupational-safety-and-health-placing-on-the-market-law-and-occupational-safety-and-health-law}{%
\subsubsection{Occupational safety and health: placing on the market law
and occupational safety and health
law}\label{occupational-safety-and-health-placing-on-the-market-law-and-occupational-safety-and-health-law}}

An appropriate assessment or even \textbf{testing} with regard to the
required functional safety according to uniform and ideally standardized
criteria has numerous consequences for the future orientation and
organization of technical \textbf{occupational safety and health (OSH)}
in Germany and in Europe. In addition to the currently still very
difficult safety-related assessability of AI algorithms, an important point is that the
previous clear separation between \textbf{placing on the market law}
(see e.g.~Machinery Directive) and \textbf{occupational safety and
health law} (see European Framework Directive for Occupational Safety
and Health and German Ordinance on Occupational Safety and Health) can
no longer be continued in this way. The reason for this is that
\textbf{safety-related properties} will also change, especially of
systems \textbf{continuously learning} at runtime, due to new or
\textbf{adapted behaviors} learned during operation
(\cite{BAuA_Rechtsgutachten_KI_2021}). From today's point of view,
systems based on \textbf{fully trained} models that remain
\textbf{invariable} at runtime are not affected by this.

\hypertarget{evaluation-of-systems-capable-of-learning}{%
\subsubsection{Evaluation of systems capable of
learning}\label{evaluation-of-systems-capable-of-learning}}

For these reasons, especially the actors of \textbf{technical
occupational safety and health} who will deal with the
\textbf{evaluation} of such \textbf{systems capable of learning} or
system components with AI algorithms in the future should familiarize
themselves in depth with the software structures used for this purpose
as early as possible. This is the only way to ensure that the rapid
development of systems capable of learning can be accompanied by OSH and
their testing authorities in a constructive, critical and technically
appropriate manner. If this is omitted, it must be assumed on the basis
of the experiences of recent years that the OSH system will be
ruthlessly circumvented or undermined by the economic interests of
globally operating software giants. This would have the consequence that
serious or fatal \textbf{occupational accidents} are more likely to
occur \textbf{due to inadequately designed AI-based work systems}.

\hypertarget{entry-into-the-world-of-ml}{%
\subsubsection{Entry into the world of
ML}\label{entry-into-the-world-of-ml}}

However, the safety-related evaluation of such learning-capable systems
requires a more in-depth technical entry into the world of
\textbf{machine learning} as a subfield of \textbf{artificial
intelligence}. For this purpose, it is necessary to deal with the basic
operation of typical ML algorithms, corresponding software tools,
libraries and programming systems.

However, anyone who opens the door for the first time to this world,
which is as exciting as it is arbitrarily complex and, at first glance,
confusing, will very quickly be overwhelmed. In addition to reading
general technical literature, it is advisable to consult introductory
and systematic tutorials.

\hypertarget{goals-of-the-getting-started-tutorial}{%
\subsubsection{Goals of the Getting Started
Tutorial}\label{goals-of-the-getting-started-tutorial}}

This Getting Started tutorial has exactly this goal, demonstrating
systematically and step-by-step the typical ML workflow using the very
powerful \textbf{Support Vector Classifier (SVC)} as an example.

Besides the \textbf{deep neural networks}, which are very present in the
media, there is a very rich diversity of other very powerful ML
algorithms - suitable for the particular use case. For a more generally
comprehensible introduction, the SVC algorithm was deliberately chosen
for the target audience of the workshop. Its operating principles are
easy to convey both to ML novices and within the time frame given for
the workshop - quite in contrast to the entry into the world of deep
neural networks.

\hypertarget{ml-workflow-as-a-step-by-step-guide}{%
\subsubsection{ML workflow as a step-by-step
guide}\label{ml-workflow-as-a-step-by-step-guide}}

The following main sections will demonstrate the typical ML workflow
step-by-step. In \textbf{step 0}, specific guidance is provided for
selecting hardware and software suitable for machine learning. To allow
an ML novice to first familiarize themselves with the ML algorithms,
tools, libraries, and programming systems, the ready-made and very
beginner-friendly \textbf{Iris dataset} is acquired in \textbf{step 1}.
Only after a comprehensive acquaintance with the application of ML tools
would it make sense to examine one's own environment for ML-suitable
applications and to obtain suitable datasets from them. However, this is
beyond the scope of this introductory tutorial.

One of the most important steps in the entire ML process is \textbf{step
2}, in which the dataset acquired in step 1 is examined using typical
data analysis tools. In addition to exploring the \textbf{data
structure} and \textbf{internal correlations} in the dataset, errors
such as gaps, duplications, or obvious misentries must also be found and
corrected where possible. This is enormously important so that the
classification can later provide plausible results.

After exploring the dataset, in \textbf{step 3} one has to decide on a
specific ML algorithm based on certain selection criteria. Among other
ML algorithms suitable for the Iris dataset (such as the
decision-tree-based \textbf{random-forests classifier}), the reasoned
choice here in the tutorial falls on the \textbf{support vector
classifier}. A corresponding SVC model is then implemented.

In \textbf{step 4} the dataset is preprocessed for the actual
classification by SVC. Depending on the selected ML algorithm as well as
the data structure, it may be necessary to prepare the data before
training (e.g., by standardization or normalization). After splitting
the dataset into a training and test dataset, the SVC model is trained
with the training dataset in \textbf{step 5}. Subsequently,
classification predictions are made with the trained SVC model based on
the test data. In \textbf{step 6}, the quality of the classification
result is evaluated using known \textbf{metrics} such as the
\textbf{confusion matrix}.

Since the classification in step 5 was initially performed with standard
parameters (so-called \textbf{hyper-parameters}), their meaning is
explained in \textbf{step 7} and then their effect on the classification
result is demonstrated by manually varying the individual
hyper-parameters.

In the final \textbf{step 8}, two approaches to systematic
hyper-parameter search are presented: \textbf{Grid Search} and
\textbf{Randomized Search}. While the former exhaustively considers all
parameter combinations for given values, the latter selects a number of
candidates from a parameter space with a particular random distribution.

\hypertarget{presentation-at-the-artificial-intelligence-conference-in-2022}{%
\subsubsection{Presentation at the ``Artificial Intelligence''
conference in
2022}\label{presentation-at-the-artificial-intelligence-conference-in-2022}}

In November 2022, the \textbf{Artificial Intelligence Conference} took
place in Dresden, which was hosted by the German Social Accident
Insurance (DGUV). There, the current tutorial was presented to
interested ML newcomers in the technical occupational safety and health
of the social accident insurance institutions as part of a separate
\textbf{Getting Started Workshop}.

    \hypertarget{german-introduction}{%
\subsection{German introduction}\label{german-introduction}}

\hypertarget{ki-und-ml-in-der-digitalisierten-arbeitswelt}{%
\subsubsection{KI und ML in der digitalisierten
Arbeitswelt}\label{ki-und-ml-in-der-digitalisierten-arbeitswelt}}

Von den \textbf{Arbeitsmitteln} in der \textbf{digitalisierten
Arbeitswelt} wird immer stärker gefordert, dass sie sich selbstständig
und aufgabenbezogen an sich ändernde Arbeitssituationen anpassen können.
Diese \textbf{situative Adaptivität} kann je nach Stärke des
Flexibilisierungsgrades oft nur durch die Anwendung mathematischer
Modelle und Algorithmen aus dem Bereich des \textbf{Maschinellen Lernens
(ML)} als Teilmenge der \textbf{Künstlichen Intelligenz (KI)} realisiert
werden.

\hypertarget{automatisierung-und-autonomie}{%
\subsubsection{Automatisierung und
Autonomie}\label{automatisierung-und-autonomie}}

Beispiele für solche KI-Anwendungen in der Arbeitswelt reichen von
vergleichsweise einfachen \textbf{Sprachassistenzsystemen} (ähnlich z.
B. Siri oder Alexa aus dem privaten Umfeld) bis hin zu teil- oder
\textbf{hochautomatisierten Systemen}. Der Übergang von
\textbf{Automatisierung zu Autonomie} wird derzeit in der Fachwelt sehr
kontrovers diskutiert und kann unter dem Aspekt des Übergangs der
Verantwortung vom Menschen zum technischen System betrachtet werden
(\cite{Adler_2021}; \cite{Adler_2019}).

Definitionsgemäß wird ein System erst dann als \textbf{autonom}
bezeichnet, wenn es \textbf{ohne menschliche Steuerung} oder
detaillierte \textbf{Programmierung} ein vorgegebenes Ziel
\textbf{selbstständig} und an die Situation angepasst erreichen kann
(\cite{EFI_autSysteme_2018}; \cite{acatech_2017}).

Allerdings ist die Unterscheidung des Grades der Automatisierung bis hin
zur Autonomie eines technischen Systems relativ fließend und je nach
fachlichem Kontext und Abstraktionsgrad nur schwer zu definieren.
Maßgeblich für die Einordnung sind die Grade der
\textbf{Selbstbestimmtheit}, die \textbf{Unabhängigkeit} sowie die
\textbf{Entscheidungs- bzw. Handlungsfreiheit} eines technischen Systems
gegenüber \textbf{menschlichem Eingriff} oder vorprogrammierten
Verhaltensmustern (vgl. \cite{Wiki_Autonomie}).

Im Gegensatz zu hochautomatisierten Systemen sind autonome Systeme nur
durch Einsatz von KI-Algorithmen in der Lage, eigenständig zu agieren,
Probleme zu lösen und dabei zu lernen, sich ständig zu verbessern
(\cite{acatech_2017}).

Beispielsweise können \textbf{fahrerlose Transportsysteme (FTS)} anhand
selbst erlernter, selbstständig aktualisierter und mit anderen FTS
geteilter Karten \textbf{autonom} durch größere Industrieanlagen
navigieren und ortsveränderlichen Hindernissen ausweichen, indem sie
selbstständig geeignete Routen finden und optimieren. Jedoch werden
ihnen in einer höheren Abstraktionsebene neue Logistikaufträge durch
menschliche Bediener vorgegeben, weswegen es sich bei FTS aus
menschlicher Perspektive eher um \textbf{hochautomatisierte Systeme}
handelt.

\hypertarget{betriebs--und-sicherheitsfunktionen}{%
\subsubsection{Betriebs- und
Sicherheitsfunktionen}\label{betriebs--und-sicherheitsfunktionen}}

Neben den vielen sehr interessanten Vorteilen z. B. bzgl.
Wirtschaftlichkeit und Arbeitserleichterung kennzeichnet solche
hochautomatisierten und je nach Betrachtung autonomen Teilsysteme eine
sehr hohe technische Komplexität. Diese betrifft sowohl ihre
\textbf{Betriebsfunktionen} (z. B. autonome Navigation durch komplexe
industrielle Umgebungen bei gemeinsamer Nutzung der Fahrwege durch
andere menschlich gesteuerte Fahrzeuge) als auch ihre
\textbf{Sicherheitsfunktionen} (z. B. Auswertung miteinander verknüpfter
bildgebender und nicht-bildgebender Sicherheitssensorik zur Überwachung
des Fahrraums zur Kollisionsvermeidung).

\hypertarget{anforderungen-an-sicherheitsfunktionen}{%
\subsubsection{Anforderungen an
Sicherheitsfunktionen}\label{anforderungen-an-sicherheitsfunktionen}}

An solche autonomen Systeme und die hierfür eingesetzten KI-Algorithmen
werden sehr hohe Anforderungen hinsichtlich der \textbf{funktionalen
Sicherheit} gestellt. Jedoch sind die Anforderungen für eine
sicherheitstechnische Bewertbarkeit bezüglich der \textbf{Transparenz}
(vollständiges Systemverständnis) und \textbf{Erklärbarkeit} der durch
KI getroffenen Entscheidungen insbesondere bei Einsatz von
KI-Algorithmen aus dem Bereich des \textbf{Deep Learnings} derzeit nur
sehr schwer oder gar nicht erreichbar (\cite{Liggesmeyer_2019}).

Im Gegensatz zu automatisierten Systemen wird die Funktionalität
KI-gestützter autonomer Systeme nicht vor der betrieblichen Verwendung
vollständig ausprogrammiert, sondern durch das Anwenden lernfähiger
Algorithmen auf Daten erstellt. Dadurch entsteht ein Modell, das von der
Software zur Laufzeit lediglich ausgeführt wird. Das resultierende
Modell ist aufgrund seiner \textbf{inhärenten Komplexität} im
Allgemeinen \textbf{für den Menschen nicht verständlich}, wodurch die
\textbf{Entscheidungen} eines KI-Systems oft \textbf{nicht transparent}
sind. Obwohl die Anforderungen an das KI-System typischerweise nicht
vollständig beschrieben werden können, muss es später zur Laufzeit in
einem sehr großen Anwendungsraum trotzdem verlässlich funktionieren
(\cite{Schneider_2021}). Dadurch kommen die heute etablierten Methoden
und Techniken des systematischen Softwareentwurfes und -testens
sicherheitsgerichteter Software an ihre Grenzen (vgl. \textbf{V-Modell}
nach \cite{DIN_EN_61508-3_2011-02}).

Weiterhin erfüllen heutige KI-Algorithmen hinsichtlich ihrer
erreichbaren \textbf{Erkennungsraten} und damit der
\textbf{Zuverlässigkeiten ihrer Entscheidungen} selbst unter günstigsten
Bedingungen sehr oft nicht die Anforderungen an die funktionale
Sicherheit, um höhere Safety-Level zu erreichen. Beispielsweise darf
eine software-gestützte Sicherheitsfunktion mit einem für Maschinen
typischerweise geforderten Performance Level d \((PL_{d})\) nach ISO
13849-1 bei kontinuierlicher Nutzung nur mit einer Wahrscheinlichkeit
von \(10^{-7}\) bis \(10^{-6}\) pro Stunde gefährlich ausfallen (siehe Tabelle
K.1 in \cite{DIN_EN_ISO_13849-1_2016}).

Im Vergleich zu traditioneller, vollständig ausprogrammierter Software
ist bei datengetriebenen Algorithmen aus dem Bereich des Deep Learnings
die verhältnismäßig geringe Robustheit eine weitere Herausforderung.
Diese kann dazu führen, dass \textbf{kleine Änderungen} in den
funktionsbestimmenden \textbf{Trainingsdaten} unter Umständen
\textbf{große und unvorhersehbare Veränderungen} des Systemverhaltens
bewirken. Jedoch sind die \textbf{Vorhersehbarkeit} und
\textbf{Nachvollziehbarkeit} des Systemverhaltens für einen
\textbf{Sicherheitsnachweis} elementar
(\cite{BAuA_Rechtsgutachten_KI_2021}).

\hypertarget{arbeitsschutz-inverkehrbringensrecht-und-betrieblicher-arbeitsschutz}{%
\subsubsection{Arbeitsschutz: Inverkehrbringensrecht und betrieblicher
Arbeitsschutz}\label{arbeitsschutz-inverkehrbringensrecht-und-betrieblicher-arbeitsschutz}}

Eine hinsichtlich der geforderten funktionalen Sicherheit angemessene
Bewertung oder gar \textbf{Prüfung} nach einheitlichen und idealerweise
genormten Maßstäben hat viele Konsequenzen für die zukünftige
Ausrichtung und Gestaltung des \textbf{technischen Arbeitsschutzes} in
Deutschland und in Europa. Neben der derzeit noch sehr schwierigen
sicherheitstechnischen Bewertbarkeit von KI-Algorithmen ist ein
wichtiger Punkt, dass die bisherige klare Trennung zwischen
\textbf{Inverkehrbringensrecht} (siehe z. B. Maschinenrichtlinie) und
\textbf{betrieblichem Arbeitsschutzrecht} (siehe
Arbeitsschutz-Rahmenrichtlinie und Betriebssicherheitsverordnung) so
nicht mehr aufrechterhalten werden kann. Grund hierfür ist, dass sich
auch die \textbf{sicherheitsrelevanten Eigenschaften} insbesondere von
zur Laufzeit \textbf{weiterlernenden Systemen} durch während des
Betriebs erlernte, neue oder \textbf{angepasste Verhaltensweisen}
verändern werden (\cite{BAuA_Rechtsgutachten_KI_2021}). Systeme auf
Basis \textbf{ausgelernter} und zur Laufzeit \textbf{unveränderlicher
Modelle} sind aus heutiger Sicht hiervon nicht betroffen.

\hypertarget{pruxfcfung-lernfuxe4higer-systeme}{%
\subsubsection{Prüfung lernfähiger
Systeme}\label{pruxfcfung-lernfuxe4higer-systeme}}

Aus diesen Gründen sollten sich insbesondere die Akteure des
\textbf{technischen Arbeitsschutzes}, die sich zukünftig mit der
\textbf{Prüfung} solcher \textbf{lernfähigen Systeme} oder
Systemkomponenten mit KI-Algorithmen befassen werden, möglichst
frühzeitig mit den hierfür eingesetzten Software-Strukturen vertieft
auseinandersetzen. Nur dadurch lässt sich erreichen, dass die stürmische
Entwicklung lernfähiger Systeme durch den Arbeitsschutz und dessen
Prüfinstitute konstruktiv, kritisch und fachlich angemessen begleitet
werden kann. Wird dies versäumt, muss aufgrund der Erfahrungen der
vergangenen Jahre davon ausgegangen werden, dass das Arbeitsschutzsystem
durch die wirtschaftlichen Interessen global agierender Softwaregiganten
skrupellos umgangen oder ausgehebelt werden wird. Dies hätte die Folge,
dass schwere oder tödliche \textbf{Arbeitsunfälle wegen unzulänglich
gestalteter KI-basierter Arbeitssysteme} wahrscheinlicher werden.

\hypertarget{einstieg-in-die-welt-des-ml}{%
\subsubsection{Einstieg in die Welt des
ML}\label{einstieg-in-die-welt-des-ml}}

Allerdings erfordert die sicherheitstechnische Bewertung solcher
lernfähigen Systeme einen tiefer gehenden fachlichen Einstieg in die
Welt des \textbf{maschinellen Lernens} als Teilgebiet der
\textbf{künstlichen Intelligenz}. Hierzu muss sich mit den grundlegenden
Funktionsweisen typischer ML-Algorithmen, entsprechenden
Software-Werkzeugen, Bibliotheken und Programmiersystemen
auseinandergesetzt werden.

Wer jedoch zum ersten Mal die Tür zu dieser ebenso spannenden wie
beliebig komplexen und auf den ersten Blick verwirrenden Welt öffnet,
wird sehr schnell überfordert sein. Hier empfiehlt es sich neben dem
Lesen allgemeiner Fachliteratur, einführende und systematische
Anleitungen zu Rate zu ziehen.

\hypertarget{ziele-des-getting-started-tutorials}{%
\subsubsection{Ziele des
Getting-Started-Tutorials}\label{ziele-des-getting-started-tutorials}}

Genau dieses Ziel verfolgt das vorliegende Getting-Started-Tutorial,
indem systematisch und Schritt-für-Schritt der typische ML-Arbeitsablauf
am Beispiel des sehr leistungsfähigen \textbf{Support Vector Classifier
(SVC)} demonstriert wird.

Neben den medial sehr präsenten \textbf{tiefen neuronalen Netzen} gibt
es eine sehr reichhaltige Auswahl anderer sehr leistungsfähiger
ML-Algorithmen - passend für den jeweiligen Anwendungsfall. Für einen
allgemein verständlicheren Einstieg wurde für die Zielgruppe des
Workshops der SVC-Algorithmus bewusst gewählt. Dessen Arbeitsweise ist
sowohl für ML-Neulinge als auch in dem für den Workshop vorgegebenen
Zeitrahmen leicht vermittelbar - ganz im Gegensatz zum Einstieg in die
Welt der tiefen neuronalen Netze.

\hypertarget{ml-arbeitsablauf-als-schritt-fuxfcr-schritt-anleitung}{%
\subsubsection{ML-Arbeitsablauf als
Schritt-für-Schritt-Anleitung}\label{ml-arbeitsablauf-als-schritt-fuxfcr-schritt-anleitung}}

Die folgenden Hauptabschnitte demonstrieren den typischen
ML-Arbeitsablauf Schritt-für-Schritt. Im \textbf{Schritt 0} werden
konkrete Hinweise für die Auswahl der für das maschinelle Lernen
geeigneten Hardware und Software gegeben. Damit sich ein ML-Neuling
zunächst mit den ML-Algorithmen, Werkzeugen, Bibliotheken und
Programmiersystemen vertraut machen kann, wird im \textbf{Schritt 1} der
fertige und sehr einsteigerfreundliche \textbf{Iris-Datensatz}
hinzugezogen. Erst nach einer umfassenden Einarbeitung in die Anwendung
der ML-Werkzeuge wäre es sinnvoll, die eigene Umgebung auf ML-taugliche
Anwendungen hin zu untersuchen und daraus geeignete Datensätze zu
gewinnen. Dies geht jedoch über den Rahmen dieses einführenden Tutorials
hinaus.

Mit der wichtigste Schritt im gesamten ML-Prozess ist \textbf{Schritt
2}, in dem der in Schritt 1 einbezogene Datensatz mit Hilfe typischer
Datenanalyse-Werkzeuge untersucht wird. Neben der Erkundung der
\textbf{Datenstruktur} sowie \textbf{innerer Zusammenhänge} im Datensatz
müssen auch Fehler wie z. B. Lücken, Dopplungen oder offensichtliche
Fehleingaben gefunden und nach Möglichkeit behoben werden. Dies ist
enorm wichtig, damit die Klassifikation später plausible Ergebnisse
liefern kann.

Nach der Erkundung des Datensatzes muss man sich im \textbf{Schritt 3}
anhand bestimmter Auswahlkriterien für einen konkreten ML-Algorithmus
entscheiden. Neben anderen für den Iris-Datensatz passenden
ML-Algorithmen (wie z. B. dem entscheidungsbaum-basierten
\textbf{Random-forests-Classifier}) fällt die begründete Auswahl hier im
Tutorial auf den \textbf{Support-Vector-Classifier}. Ein entsprechendes
SVC-Modell wird nun implementiert.

Im \textbf{Schritt 4} wird der Datensatz für die eigentliche
Klassifikation per SVC vorbereitet. Je nach gewähltem ML-Algorithmus
sowie der Datenstruktur kann es erforderlich sein, dass die Daten vor
dem Training aufbereitet werden müssen (z. B. durch Standardisierung
oder Normalisierung). Nach der Aufteilung des Datensatzes in einen
Trainings- und Testdatensatz wird das SVC-Modell im \textbf{Schritt 5}
mit dem Trainingsdatensatz trainiert. Anschließend werden mit dem
trainierten SVC-Modell anhand der Testdaten Klassifikationsvorhersagen
getroffen. Im \textbf{Schritt 6} wird die Güte des
Klassifikationsergebnisses anhand bekannter \textbf{Metriken} wie z. B.
der \textbf{Konfusionsmatrix} evaluiert.

Da die Klassifikation im Schritt 5 zunächst mit Standard-Parametern (den
sogenannten \textbf{Hyper-Parametern}) durchgeführt wurde, wird ihre
Bedeutung im \textbf{Schritt 7} erklärt und danach ihr Einfluss auf das
Klassifikationsergebnis durch manuelle Variation der einzelnen
Hyper-Parameter demonstriert.

Im abschließenden \textbf{Schritt 8} werden zwei Ansätze zur
systematischen Hyper-Parameter-Suche vorgestellt: \textbf{Grid Search}
und \textbf{Randomized Search}. Während bei ersterer für gegebene Werte
erschöpfend alle Parameterkombinationen betrachtet werden, wird beim
zweiten Ansatz eine Anzahl von Kandidaten aus einem Parameterraum mit
einer bestimmten zufälligen Verteilung ausgewählt.

\hypertarget{vorstellung-auf-der-fachtagung-kuxfcnstliche-intelligenz-in-2022}{%
\subsubsection{Vorstellung auf der Fachtagung ``Künstliche Intelligenz''
in
2022}\label{vorstellung-auf-der-fachtagung-kuxfcnstliche-intelligenz-in-2022}}

Im November 2022 fand die \textbf{Fachtagung ``Künstliche Intelligenz''}
in Dresden statt, welche durch die Deutsche Gesetzliche
Unfallversicherung (DGUV) ausgerichtet wurde. Dort wurde im Rahmen eines
eigenen \textbf{Getting-Started-Workshops} das vorliegende Tutorial
interessierten ML-Neulingen im technischen Arbeitsschutz der
gesetzlichen Unfallversicherungsträger präsentiert.

    \hypertarget{steps-of-the-systematic-ml-process}{%
\subsection{Steps of the systematic ML
process}\label{steps-of-the-systematic-ml-process}}

The following \textbf{steps of the systematic ML process} are covered in
the next main sections:

\begin{itemize}
\tightlist
\item
  \hyperref[step-0-select-hardware-and-software-suitable-for-ml]{STEP 0: Select hardware and software suitable for ML}
\item
  \hyperref[step-1-acquire-the-ml-dataset]{STEP 1: Acquire the ML dataset}
\item
  \hyperref[step-2-explore-the-ml-dataset]{STEP 2: Explore the ML dataset}
\item
  \hyperref[step-3-choose-and-create-the-ml-model]{STEP 3: Choose and create the ML model}
\item
  \hyperref[step-4-preprocess-the-dataset-for-training]{STEP 4: Preprocess the dataset for training}
\item
  \hyperref[step-5-carry-out-training-prediction-and-testing]{STEP 5: Carry out training, prediction and testing}
\item
  \hyperref[step-6-evaluate-models-performance]{STEP 6: Evaluate model’s performance}
\item
  \hyperref[step-7-vary-parameters-of-the-ml-model-manually]{STEP 7: Vary parameters of the ML model manually}
\item
  \hyperref[step-8-tune-the-ml-model-systematically]{STEP 8: Tune the ML model systematically}
\end{itemize}

    \hypertarget{step-0-select-hardware-and-software-suitable-for-ml}{%
\section{STEP 0: Select hardware and software suitable for
ML}\label{step-0-select-hardware-and-software-suitable-for-ml}}

In this step, specific guidance is provided for selecting hardware and
software suitable for machine learning.

    \hypertarget{community-support}{%
\subsection{Community Support}\label{community-support}}

When selecting and deciding for or against the use of certain hardware
and software components, in addition to purely technical or financial
characteristics, significant attention should be paid to broad
\textbf{support from a well-networked community}. This community should
consist of a balanced share of \textbf{manufacturers} of hardware
components (e.g.~GPU suppliers, manufacturers of embedded systems or
sensors), \textbf{software developers} ideally from the \textbf{open
source} ecosystem, and an active \textbf{user community} (e.g.~for
reporting hardware and software bugs or providing help in forums).

The author's many years of development experience show that the
technically best hardware or software component is worthless if you are
(apparently) the only user. This impression arises either because the
component is actually very exotic and has only a few users or because
the development takes place ``behind closed doors'', i.e.~in the
company's internal \textbf{closed source} domain.

Without the support of an active community, you are (almost) on your own
when it comes to questions or problems. Progress in the development and
maintenance of an AI application is therefore very difficult! The clear
recommendation is therefore: Go for the (technically, price-wise, etc.)
\textbf{second-best alternative} but with an even bigger
\textbf{community}.

    \hypertarget{hardware}{%
\subsection{Hardware}\label{hardware}}

When considering hardware requirements, two systems and their use cases
must be taken into account: the \textbf{training system} and the
\textbf{application system}.

\hypertarget{training-system}{%
\subsubsection{Training system}\label{training-system}}

The \textbf{training phase} requires a lot of \textbf{computational
power} and \textbf{memory (RAM)}, depending on the \textbf{amount of
data} to be processed and the \textbf{ML algorithm (so-called
estimator)} chosen.

Depending on the estimator model, highly parallel processing on a
\textbf{Graphics Processing Unit (GPU)} can provide significant
\textbf{speed advantages} over processing on a \textbf{Central
Processing Unit (CPU)} (e.g., when training deep neural networks in the
area of \textbf{deep learning}). To take advantage of this speed
benefit, the AI application must be suitable in terms of
\textbf{parallelizability} of the estimator model used as well as
\textbf{GPU support} through special driver layers, the so-called
\href{https://en.wikipedia.org/wiki/Operating_system_abstraction_layer}{Operating
System Abstraction Layer (OSAL)} (\cite{Wiki_OSAL}).

Such GPUs are installed on powerful \textbf{3D graphics cards}. However,
these must be explicitly qualified for use in AI applications - not
every game-suitable graphics card from any manufacturer can be used. The
manufacturer \textbf{Nvidia} offers GPUs suitable for AI in its
high-performance graphics cards with \textbf{CUDA architecture}.
\href{https://en.wikipedia.org/wiki/CUDA}{CUDA} stands for ``Compute
Unified Device Architecture'' and is a \textbf{programming interface}
(API) developed by Nvidia, with which program parts can be processed by
the graphics processor (\cite{Wiki_CUDA}). A GPU with its several tens
of thousands of threads can process highly parallelizable tasks that
require only little data communication between the memory areas
significantly faster than conventional CPUs. This speed advantage can
be considerable despite currently available CPU technologies such as
\textbf{multicore} processors with \textbf{Hyper-Threading} from
Intel!

Nvidia graphics cards with CUDA-supporting GPUs are ranked based on
their \textbf{\href{https://developer.nvidia.com/cuda-gpus}{compute
capability}} (\cite{NVIDIA_CUDA_CAP_2022}).

However, it should be mentioned that currently only the manufacturer
Nvidia offers 3D graphics cards with CUDA implementation, since CUDA is
a \textbf{proprietary} framework. In addition, there is also the much
less well-known \textbf{open source} alternative
\textbf{\href{https://en.wikipedia.org/wiki/OpenCL}{OpenCL}}, which has
now been implemented by a large number of graphics card manufacturers
(\cite{Wiki_OpenCL}). Since OpenCL is an \textbf{open industry
standard}, Intel and AMD chips and their GPUs, ATI Radeon cards of the
5, 6, 7 and R9 series as well as various Nvidia GeForce cards are
supported, for example.

Regarding the \textbf{code execution performance} of both alternatives
in direct comparison, there are different statements in the technical
literature. The 2011 paper \href{https://arxiv.org/abs/1005.2581}{A
Performance Comparison of CUDA and OpenCL} sees the CUDA implementation
as the clear favorite (\cite{CUDA_OpenCL_Perf_2011}). More recent
publications point out the strong dependence of performance on
\textbf{code quality}, \textbf{algorithm type} and the \textbf{GPU
hardware} used, among other things - see e.g.~here:
\href{https://www.incredibuild.com/blog/cuda-vs-opencl-which-to-use-for-gpu-programming}{CUDA
vs OpenCL: Which to Use for GPU Programming}
(\cite{CUDA_vs_OpenCL_2021}).

It is therefore recommended that the decision for \textbf{CUDA or
OpenCL} should depend on the extent to which most of the applications
employed and the GPU hardware used are better supported by one of the
two approaches in each case.

The \textbf{state of the art} should also be taken into account when
selecting the rest of the training system's hardware. Otherwise,
seemingly (price-wise) inexpensive components could very quickly nullify
the speed advantage of the GPU. In addition to a mainboard suitable for
one (or more) high-performance graphics cards with a correspondingly
powerful bus system (e.g.~PCI Express), the RAM should be as large as
possible (min. 64 GB) and fast. A large RAM allows, for example, the
\textbf{virtualization} of several parallel systems in the form of
\textbf{\href{https://en.wikipedia.org/wiki/Virtual_machine}{virtual
machines}} and thus a significantly better utilization of the available
computing capacity (\cite{Wiki_VM}). The permanent memory should also be
as large and fast as possible - high-performance \textbf{solid-state
drives (SSDs)} should be clearly preferred over classic hard disks
(HDDs).

    \hypertarget{application-system}{%
\subsubsection{Application system}\label{application-system}}

In the \textbf{application phase} of the trained estimator model,
considerably less computing power and RAM are usually required. If the
concrete application does not require \textbf{continuous learning during
operation}, significantly less expensive systems (in terms of
acquisition costs, power consumption, etc.) can also be used. Such
application-specific \textbf{embedded systems} have only one CPU
(usually in \textbf{ARM architecture}), comparatively limited RAM
(e.g.~1 - 8 GB) and usually no GPU. A popular \textbf{embedded computer}
that is very well supported in terms of ML software is the
\textbf{\href{https://en.wikipedia.org/wiki/Raspberry_Pi}{Raspberry Pi}}
(\cite{Wiki_Raspi}). In addition to its ARM CPU, the Raspberry Pi also
has a GPU installed on the same processor in the so-called
\textbf{System on a Chip design (SoC)}. However, the SoC manufacturer
\textbf{Broadcom} does not support the CUDA API.

There are references in the technical literature that the open source
alternative \textbf{OpenCL} can be installed on the Raspberry Pi and
that the AI framework \textbf{TensorFlow} (see section ``Software'') can
be compiled with
\textbf{\href{https://en.wikipedia.org/wiki/SYCL}{SYCL}} support, where
SYCL stands for ``Single Source OpenCL'' (\cite{Wiki_SYCL}). However, a
first rough review gives the impression that support for this approach
is still very experimental at the moment. Therefore, parallelizing the
AI application on the GPU of the Raspberry Pi does not seem to be an
option (yet). Here are some links for further reading:

\begin{itemize}
\tightlist
\item
  \href{https://qengineering.eu/deep-learning-with-raspberry-pi-and-alternatives.html}{Deep
  learning with Raspberry Pi and alternatives in 2022}
  (\cite{DL_Raspi_2022})
\item
  \href{https://www.hackster.io/news/benchmarking-machine-learning-on-the-new-raspberry-pi-4-model-b-88db9304ce4}{Benchmarking
  Machine Learning on the New Raspberry Pi 4, Model B}
  (\cite{ML_Raspi4_2019})
\item
  \href{https://towardsdatascience.com/portable-computer-vision-tensorflow-2-0-on-a-raspberry-pi-part-1-of-2-84e318798ce9}{Portable
  Computer Vision: TensorFlow 2.0 on a Raspberry Pi}
  (\cite{TF2_Raspi4_2019})
\item
  \href{https://qengineering.eu/install-opencl-on-raspberry-pi-3.html}{Install
  OpenCL on Raspberry Pi 3 B+} (\cite{OpenCL_Raspi_2022})
\item
  \href{https://indiantechwarrior.com/does-tensorflow-support-opencl/}{Does
  TensorFlow Support OpenCL?} (\cite{TF_OpenCL_2022})
\item
  \href{https://www.codeplay.com/portal/blogs/2016/06/01/tensorflow-for-opencl-using-sycl.html}{TensorFlow
  for OpenCL using SYCL} (\cite{TF_OpenCL_SYCL_2016})
\end{itemize}

    \hypertarget{software}{%
\subsection{Software}\label{software}}

    \hypertarget{programming-languages}{%
\subsubsection{Programming languages}\label{programming-languages}}

The comparison of \textbf{advantages and disadvantages} of the various
programming languages and the evaluation of their suitability for ML was
inspired by the following articles, among others:

\begin{itemize}
\tightlist
\item
  \href{https://www.springboard.com/blog/data-science/best-language-for-machine-learning/}{What
  Is the Best Language for Machine Learning?}
  (\cite{ML_bestLanguage_2021})
\item
  \href{https://datasciencenerd.com/is-octave-good-for-machine-learning/}{Is
  Octave Good for Machine Learning?} (\cite{Octave_for_ML_2021})
\end{itemize}

In summary, there is \textbf{no best language for machine learning};
each is good where it fits best.

However, there are definitely some programming languages that are better
suited for machine learning tasks than others
(\cite{ML_bestLanguage_2021}). On the one hand, this is due to whether
the programming language is fundamentally well suited to
\textbf{implement complex mathematical and statistical tasks} in
efficient algorithms.

On the other hand, when deciding for or against a programming language,
it should definitely also be taken into account whether it contains
sufficient \textbf{basic functionalities for data analysis and data
processing}, and whether very diverse \textbf{extension libraries}
(so-called \textbf{packages}) that are well supported by the community
are available. By using these libraries, it is possible to concentrate
on the concrete task when creating an ML application and not have to
constantly solve the same trivial problems anew in every new application
(e.g.~the efficient \textbf{handling of datasets} or the execution of
\textbf{matrix calculations}).

The following trend chart shows how the
\href{https://insights.stackoverflow.com/trends?tags=python\%2Cr\%2Coctave\%2Cjava\%2Cc\%2B\%2B}{popularity
of selected programming languages} suitable for machine learning has
evolved since 2008:

\begin{figure}
\centering
\includegraphics{images/2022-09-07_StackOverflowTrends_ProgrammingLanguages_wide.png}
\caption{Trend chart shows popularity of programming languages for ML
(source:
\href{https://insights.stackoverflow.com/trends?tags=python\%2Cr\%2Coctave\%2Cjava\%2Cc\%2B\%2B}{Stack
Overflow Trends}, license: CC BY-SA 4.0)}
\end{figure}

    \hypertarget{python}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/Python_(programming_language)}{Python}}{Python}}\label{python}}

It is a high-level, \textbf{general-purpose} programming language whose
design philosophy emphasizes \textbf{code readability}. The
\textbf{variable types} in Python are \textbf{dynamic} and
\textbf{memory} is \textbf{automatically managed} to create and delete
data objects (see
\href{https://en.wikipedia.org/wiki/Garbage_collection_(computer_science)}{garbage
collection}).

\hypertarget{pros}{%
\subparagraph{Pros:}\label{pros}}

\begin{itemize}
\tightlist
\item
  Python offers simple, concise, and \textbf{readable code}, which allows
  robust and reliable programs to be written.
\item
  It lets you focus on solving the ML problem instead of getting lost in
  the language's technical nuances.
\item
  Python has \textbf{extensive libraries for ML},
  e.g.~\texttt{Scikit-learn}, \texttt{Pandas}, \texttt{TensorFlow} or
  \texttt{Keras} have become standard libraries for various ML tasks.
\item
  The language has been around for decades and has developed a large and
  helpful community.
\item
  Besides extensive online documentation, there are thousands of
  questions and answers as well as community guides for various functionalities of
  the language (this is also very well reflected in the trend graph on
  the popularity of programming languages).
\end{itemize}

\hypertarget{cons}{%
\subparagraph{Cons:}\label{cons}}

As far as I know, there are no technical drawbacks when using Python --
of course, this evaluation depends on the particular application.

    \hypertarget{r}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/R_(programming_language)}{R}}{R}}\label{r}}

It is a programming language for \textbf{statistical computing} and
\textbf{graphics} supported by the \textbf{R Core Team} and the
\textbf{R Foundation for Statistical Computing}. Created by
statisticians Ross Ihaka and Robert Gentleman, R is used among data
miners, bioinformaticians and statisticians for data analysis and
developing statistical software.

\hypertarget{r-pros}{%
\subparagraph{Pros:}\label{r-pros}}

\begin{itemize}
\tightlist
\item
  After Python, R is the recommended ML programming language.
\item
  R is a flexible and cross-platform compatible language.
\item
  It has a growing, supportive community.
\item
  R is well suited for data visualization and \textbf{statistics}, often
  making it the language of choice for applications with a large amount
  of statistical data.
\item
  It is considered a powerful choice for \textbf{machine learning},
  offering a variety of machine learning techniques (e.g., data
  visualization, data sampling, data analysis and supervised and
  unsupervised machine learning models) via post-installable libraries.
\end{itemize}

\hypertarget{r-cons}{%
\subparagraph{Cons:}\label{r-cons}}

\begin{itemize}
\tightlist
\item
  R is often reported to be laggier and slower than Python
  when dealing with large-scale datasets.
\item
  It has \textbf{significantly lower community} support when answering
  questions or giving guidance \textbf{compared to Python} (see trend
  chart on popularity of programming languages).
\item
  The \textbf{learning curve} for the basic entry into R and the
  application in more complex projects for data analysis or machine
  learning is significantly \textbf{steeper than with Python}.
\end{itemize}

    \hypertarget{java}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/Java_(programming_language)}{Java}}{Java}}\label{java}}

It is a high-level, \textbf{class-based}, \textbf{object-oriented}
programming language that is designed to have \textbf{as few
implementation dependencies} as possible. It is a
\textbf{general-purpose} programming language intended to let compiled
Java code run on all platforms that support Java without the need to
recompile.

\hypertarget{java-pros}{%
\subparagraph{Pros:}\label{java-pros}}

\begin{itemize}
\tightlist
\item
  Using Java for machine learning is especially popular among developers
  with a Java background, as it skips the need to learn another
  programming language such as Python or R.
\item
  Like Python and R, Java also has a variety of third-party machine
  learning libraries, e.g.~\textbf{JavaML} is a built-in library with a
  collection of algorithms implemented in Java for ML.
\item
  Scalability is an important feature for many ML projects, which is
  well supported by Java.
\item
  \href{https://en.wikipedia.org/wiki/Java_virtual_machine}{Java Virtual
  Machine (JVM)} enables the development of ML applications for multiple
  platforms.
\item
  Java is very well suited for speed-critical ML projects.
\end{itemize}

\hypertarget{java-cons}{%
\subparagraph{Cons:}\label{java-cons}}

\begin{itemize}
\tightlist
\item
  Java has much lower community support in answering questions or
  giving guidance compared to Python - but a better one than R (see
  trend chart on popularity of programming languages).
\end{itemize}

    \hypertarget{gnu-octave}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/GNU_Octave}{GNU
Octave}}{GNU Octave}}\label{gnu-octave}}

It is a high-level programming language that's \textbf{designed for
numerical computations} (\cite{Octave_for_ML_2021}).

\hypertarget{gnu-octave-pros}{%
\subparagraph{Pros:}\label{gnu-octave-pros}}

\begin{itemize}
\tightlist
\item
  With Octave, \textbf{linear and non-linear numerical problems} can be
  solved quickly.
\item
  Octave is syntactically very similar to
  \href{https://en.wikipedia.org/wiki/MATLAB}{MATLAB} and mostly
  \textbf{compatible with MATLAB}. If no MATLAB-specific functions are
  used, the program code also runs in Octave. In addition, Octave even
  has some language functions and a syntax diversity that MATLAB lacks.
\end{itemize}

\hypertarget{gnu-octave-cons}{%
\subparagraph{Cons:}\label{gnu-octave-cons}}

\begin{itemize}
\tightlist
\item
  However, Octave is not a good programming language for machine
  learning in a production environment.
\item
  It doesn't have the same functionality as other languages used for ML,
  due to \textbf{missing libraries} and frameworks to speed up ML tasks.
\item
  It's not as flexible, simple, and feature-rich as other programming
  languages.
\item
  Compared to Python, R and Java, Octave has almost \textbf{no community
  support} when it comes to answering questions or providing guidance
  (compare trend chart on popularity of programming languages).
\end{itemize}

    \hypertarget{python-packages}{%
\subsubsection{Python packages}\label{python-packages}}

The \textbf{mathematics} and the \textbf{numerical implementation} of
various algorithms for data analysis and machine learning are usually
\textbf{very complex} and often only comprehensible for ML experts
(\cite{ML_bestLanguage_2021}). For a broad and practice-oriented
\textbf{usability}, better \textbf{reusability of code} and a successful
\textbf{integration} into a concrete ML application, the functional
relationships should be \textbf{encapsulated in libraries} (so-called
``packages'').

From the user's point of view, when selecting libraries for the
respective task, attention should be paid not only to functionality but
also to the \textbf{comprehensibility of the user interface supported by
good documentation}. Furthermore, the \textbf{size of the community}
behind the library, consisting of active developers as well as technical
experts for supporting the users in the event of questions or problems
arising, should be decisive in the selection.

The following trend chart shows how the
\href{https://insights.stackoverflow.com/trends?tags=pandas\%2Cnumpy\%2Cmatplotlib\%2Cseaborn\%2Cscikit-learn\%2Ctensorflow\%2Ckeras\%2Cpytorch}{popularity
of selected python packages} suitable for \textbf{data analysis},
\textbf{data visualization} and \textbf{machine learning} has evolved
since 2008:

\begin{figure}
\centering
\includegraphics{images/2022-11-11_StackOverflowTrends_MLPythonPackages_wide.png}
\caption{Trend chart shows popularity of selected python packages for
data analysis, data visualization and machine learning (source:
\href{https://insights.stackoverflow.com/trends?tags=pandas\%2Cnumpy\%2Cmatplotlib\%2Cseaborn\%2Cscikit-learn\%2Ctensorflow\%2Ckeras\%2Cpytorch}{Stack
Overflow Trends}, license: CC BY-SA 4.0)}
\end{figure}

In the scientific research and systematic improvement of ML algorithms,
a very dynamic progression can be observed in recent years. The latest
scientific findings are regularly compared with each other in
\textbf{``Machine Learning Competitions''} using known and
\textbf{freely available datasets} (see benchmarking competitions of ML
algorithms on platforms such as
\url{https://www.kaggle.com/competitions}). At the same time, the
corresponding ML libraries are revised, extended and made available to
general users by the scientific community. Therefore, this
\textbf{scientific transfer} ideally takes place in the context of
\textbf{open source developments}.

Due to the superior advantages of \textbf{Python} (see previous
section), a selection of \textbf{open source} packages that are available
for this programming language and usable for \textbf{data analysis},
\textbf{data visualization} and \textbf{machine learning} is presented
in this section.

    \hypertarget{data-analysis}{%
\paragraph{Data analysis}\label{data-analysis}}

\begin{itemize}
\item
  \href{https://numpy.org/devdocs/user/whatisnumpy.html}{NumPy} is a
  Python library that provides a \textbf{multidimensional array object},
  various derived objects (such as masked arrays and \textbf{matrices}),
  and an assortment of routines for \textbf{fast operations on arrays},
  including mathematical, logical, shape manipulation, sorting,
  selecting, discrete Fourier transformations, basic linear algebra,
  basic statistical operations, random simulation and much more.
\item
  \href{https://pandas.pydata.org/docs/getting_started/overview.html}{Pandas}
  is a Python package providing fast, flexible and expressive data
  structures designed to work with \textbf{relational} or
  \textbf{labeled} datasets. It provides two primary data structures:
  \texttt{pandas.Series} (1-dimensional time series) and
  \texttt{pandas.DataFrame} (2-dimensional spreadsheets). The data
  structure \texttt{pandas.DataFrame} offers the same functionality as
  the structure \texttt{data.frame} known from the programming language
  R and much more.
\end{itemize}

    \hypertarget{data-visualization}{%
\paragraph{Data visualization}\label{data-visualization}}

\begin{itemize}
\item
  \href{https://matplotlib.org}{Matplotlib} is a library for making
  \textbf{2D plots of arrays} in Python. Although it has its origins in
  \textbf{emulating the MATLAB graphics commands}, it is independent of
  MATLAB, and can be used in a Pythonic, object oriented way. Although
  Matplotlib is written primarily in pure Python, it makes heavy use of
  NumPy and other extension code to provide good performance even for
  large arrays (\cite{Hunter_matplotlib_2007}).
\item
  \href{https://seaborn.pydata.org/}{Seaborn} is a library for making
  \textbf{statistical graphics} in Python. It builds \textbf{on top of
  matplotlib} and integrates closely with \textbf{pandas data
  structures}. Seaborn helps to explore and understand the data. Its
  plotting functions operate on \textbf{dataframes} and \textbf{arrays}
  containing whole datasets and internally perform the necessary
  semantic mapping and statistical aggregation to produce informative
  plots (\cite{Waskom_seaborn_2021}).
\end{itemize}

    \hypertarget{machine-learning}{%
\paragraph{Machine learning}\label{machine-learning}}

\begin{itemize}
\item
  \href{https://scikit-learn.org/stable/}{Scikit-Learn} is a
  \textbf{free software machine learning library} for \textbf{Python}.
  It features various \textbf{classification}, \textbf{regression} and
  \textbf{clustering} algorithms including \textbf{support-vector
  machines}, \textbf{random forests}, \textbf{gradient boosting} and
  \textbf{k-means}. It is designed to interoperate with the Python
  numerical and scientific libraries \textbf{NumPy} and \textbf{SciPy}.
  Scikit-Learn will be used in the next steps of this getting-started
  tutorial.
\item
  \href{https://www.tensorflow.org}{TensorFlow} offers, among other
  things, the possibility to create and train \textbf{artificial neural
  networks (ANN)} based on \textbf{Google AI}. It is an open source
  software library for \textbf{machine learning} and \textbf{artificial
  intelligence}. It can be used across a range of tasks but has a
  particular focus on training and inference of \textbf{deep neural
  networks}. However, the installation and usage is very much beyond the
  scope of this beginner tutorial.
\item
  \href{https://keras.io/about/}{Keras} is an open source software
  library for \textbf{deep learning} that provides a Python interface
  for \textbf{ANNs}. Keras acts as a \textbf{general interface} for
  several \textbf{backends}, such as \textbf{TensorFlow},
  \textbf{Microsoft Cognitive Toolkit} and \textbf{Theano}. Keras will
  also not be used in this beginner tutorial.
\item
  \href{https://pytorch.org/docs/stable/index.html}{PyTorch} is an
  optimized open source tensor library for deep learning using GPUs and
  CPUs. It can be used to create \textbf{Tensor analyses} accelerated by
  \textbf{GPUs} as well as \textbf{Neural Networks} based on an Autograd
  system. Proven Python libraries such as \textbf{NumPy}, \textbf{SciPy}
  and \textbf{Cython} can be used. In Deep Learning, the program library
  is characterized by a lot of \textbf{flexibility} and a \textbf{high
  speed}. However, PyTorch will also not be used in this beginner
  tutorial.
\end{itemize}

    \hypertarget{import-python-packages-globally}{%
\subsubsection{Import Python packages
globally}\label{import-python-packages-globally}}

The aim of this section is to import globally used Python packages for
data analysis and ML, such as \texttt{Pandas}, \texttt{NumPy},
\texttt{matplotlib} and \texttt{Scikit-Learn}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{1}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{import} \PY{n+nn}{time}

\PY{k+kn}{from} \PY{n+nn}{IPython}\PY{n+nn}{.}\PY{n+nn}{display} \PY{k+kn}{import} \PY{n}{display}\PY{p}{,} \PY{n}{Markdown}

\PY{k+kn}{import} \PY{n+nn}{pandas} \PY{k}{as} \PY{n+nn}{pd}
\PY{k+kn}{import} \PY{n+nn}{numpy} \PY{k}{as} \PY{n+nn}{np}
\PY{k+kn}{import} \PY{n+nn}{matplotlib}\PY{n+nn}{.}\PY{n+nn}{pyplot} \PY{k}{as} \PY{n+nn}{plt}
\PY{k+kn}{from} \PY{n+nn}{sklearn} \PY{k+kn}{import} \PY{n}{svm}\PY{p}{,} \PY{n}{metrics}
\PY{k+kn}{import} \PY{n+nn}{seaborn} \PY{k}{as} \PY{n+nn}{sns}
\PY{o}{\PYZpc{}}\PY{k}{matplotlib} inline

\PY{c+c1}{\PYZsh{} Set font sizes of figure title, axes and labels }
\PY{c+c1}{\PYZsh{} globally via a rcParams dictionary}
\PY{k+kn}{import} \PY{n+nn}{matplotlib}\PY{n+nn}{.}\PY{n+nn}{pylab} \PY{k}{as} \PY{n+nn}{pylab}
\PY{n}{params} \PY{o}{=} \PY{p}{\PYZob{}}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{legend.fontsize}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:} \PY{l+m+mi}{12}\PY{p}{,}
         \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{axes.labelsize}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}   \PY{l+m+mi}{12}\PY{p}{,}
         \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{axes.titlesize}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}   \PY{l+m+mi}{14}\PY{p}{,}
         \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{xtick.labelsize}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}  \PY{l+m+mi}{12}\PY{p}{,}
         \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ytick.labelsize}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}  \PY{l+m+mi}{12}\PY{p}{,}
         \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{axes.edgecolor}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}   \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZsh{}000000}\PY{l+s+s1}{\PYZsq{}}\PY{p}{\PYZcb{}}
\PY{n}{pylab}\PY{o}{.}\PY{n}{rcParams}\PY{o}{.}\PY{n}{update}\PY{p}{(}\PY{n}{params}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Function to render dataframes to markdown table with caption}
\PY{k}{def} \PY{n+nf}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}\PY{p}{:}
    \PY{n}{str\PYZus{}table\PYZus{}complete} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Table: }\PY{l+s+s1}{\PYZsq{}} \PY{o}{+} \PY{n}{str\PYZus{}caption} \PY{o}{+} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+se}{\PYZbs{}n}\PY{l+s+se}{\PYZbs{}n}\PY{l+s+s1}{\PYZsq{}} \PYZbs{}
                         \PY{o}{+} \PY{n}{df}\PY{o}{.}\PY{n}{to\PYZus{}markdown}\PY{p}{(}\PY{n}{index}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{Markdown}\PY{p}{(}\PY{n}{str\PYZus{}table\PYZus{}complete}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Function to extend Pandas API to head and tail simultaneously}
\PY{k}{def} \PY{n+nf}{head\PYZus{}tail}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{rows}\PY{o}{=}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{:}
    \PY{k}{return} \PY{n}{pd}\PY{o}{.}\PY{n}{concat}\PY{p}{(}\PY{p}{[}\PY{n}{df}\PY{o}{.}\PY{n}{head}\PY{p}{(}\PY{n}{rows}\PY{p}{)}\PY{p}{,} \PY{n}{df}\PY{o}{.}\PY{n}{tail}\PY{p}{(}\PY{n}{rows}\PY{p}{)}\PY{p}{]}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{0}\PY{p}{)}
\PY{n+nb}{setattr}\PY{p}{(}\PY{n}{pd}\PY{o}{.}\PY{n}{DataFrame}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{head\PYZus{}tail}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{head\PYZus{}tail}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{programming-ides}{%
\subsubsection{Programming IDEs}\label{programming-ides}}

\textbf{\href{https://en.wikipedia.org/wiki/Integrated_development_environment}{Integrated
development environments (IDE)}} are software applications that provide
comprehensive features to computer programmers for \textbf{software
development}. An IDE typically consists of a \textbf{source code
editor}, automated \textbf{build tools} for compiling or an
\textbf{interpreter} for scripting languages, a front end to a
\textbf{version control system} such as
\href{https://en.wikipedia.org/wiki/Git}{Git} and a
\textbf{debugger} (\cite{Wiki_IDE}).

The following trend chart shows how the
\href{https://insights.stackoverflow.com/trends?tags=rstudio\%2Cjupyter-notebook\%2Cvisual-studio-code\%2Cpycharm}{popularity
of selected IDEs} suitable for ML programming languages has evolved
since 2008:

\begin{figure}
\centering
\includegraphics{images/2022-11-11_StackOverflowTrends_IDEs_wide.png}
\caption{Trend chart shows popularity of selected IDEs for ML
programming languages (source:
\href{https://insights.stackoverflow.com/trends?tags=rstudio\%2Cjupyter-notebook\%2Cvisual-studio-code\%2Cpycharm}{Stack
Overflow Trends}, license: CC BY-SA 4.0)}
\end{figure}

    \hypertarget{visual-studio-code-vsc}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/Visual_Studio_Code}{Visual
Studio Code
(VSC)}}{Visual Studio Code (VSC)}}\label{visual-studio-code-vsc}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_VSC.png}
\caption{Screenshot of IDE \emph{Visual Studio Code} (source: Kasper,
license: CC BY-SA 4.0)}
\end{figure}

It is an IDE made by \textbf{Microsoft} for \textbf{Windows},
\textbf{Linux} and \textbf{macOS}. Features include support for
\textbf{debugging}, \textbf{syntax highlighting} for many different
programming languages, intelligent \textbf{code completion} and embedded
\textbf{version control system} Git. Users can change the theme,
keyboard shortcuts, preferences, and install \textbf{extensions} from a
huge repository that add additional functionality. Despite its
platform independence, VSC is \textbf{not open source} - in fact it is
released under a traditional
\href{https://code.visualstudio.com/License/}{Microsoft product
license}.

    \hypertarget{jupyterlab}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/Project_Jupyter\#Jupyter_Notebook}{JupyterLab}}{JupyterLab}}\label{jupyterlab}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_JupyterLab.png}
\caption{Screenshot of IDE \emph{JupyterLab} (source: Kasper, license:
CC BY-SA 4.0)}
\end{figure}

It is the successor product for the web-based interactive environment
\textbf{Jupyter Notebook}. Within this IDE, Jupyter Notebook documents
can be created, edited, and executed interactively. The notebooks
consist of \textbf{input and output cells}, each of which can contain
program code, formatted text in \textbf{Markdown} format and live plots
generated from the code.

Jupyter is a new \textbf{open source} alternative to the proprietary
numerical software
\href{https://en.wikipedia.org/wiki/Wolfram_Mathematica}{Mathematica}
from \textbf{Wolfram Research} that is well on the way to becoming a
\textbf{standard for exchanging research results}
(\cite{Scientific_Paper_obsolete_2018};
\cite{Future_of_Research_Paper_2018}).

Originally Jupyter was intended as an IDE for the programming languages
\textbf{Julia} and \textbf{Python}. Besides that it is also possible to
install other interpreter kernels, such as the
\textbf{\href{https://irkernel.github.io/installation/}{IRkernel}} for
R. This can be interesting if the IDE \textbf{RStudio Desktop} is not
available on the target platform used. For example, it is very difficult
to install RStudio on the ARM-based embedded computer \textbf{Raspberry
Pi} due to many technical dependencies. In contrast, using the R kernel
in JupyterLab on the Raspberry Pi works very well and with good performance.

    \hypertarget{pycharm}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/PyCharm}{PyCharm}}{PyCharm}}\label{pycharm}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_PyCharm.png}
\caption{Screenshot of IDE \emph{PyCharm} (source: Kasper, license: CC
BY-SA 4.0)}
\end{figure}

It is an IDE of the company \textbf{JetBrains} for the programming
language \textbf{Python}. In addition to a \textbf{fee-based
professional version}, there is also a free, \textbf{open source
community version}. With the latter, pure Python projects can be created
and edited. The professional version offers support for other
programming languages such as HTML, JavaScript and SQL.

Furthermore, tools for version control (e.g.~\textbf{Git}) as well as
various possibilities for \textbf{automatic creation} and
\textbf{completion of code} are included.

    \hypertarget{rstudio}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/RStudio}{RStudio}}{RStudio}}\label{rstudio}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_RStudio.png}
\caption{Screenshot of IDE \emph{RStudio} (source: Kasper, license: CC
BY-SA 4.0)}
\end{figure}

It is an IDE and graphical user interface for the statistical
programming language \textbf{R} offered by \textbf{RStudio, Inc.} and is
made available in two formats. \textbf{RStudio Desktop} is a regular
desktop application while \textbf{RStudio Server} runs on a remote
server and allows accessing RStudio using a web browser. Both software
products are available in \textbf{open source} and \textbf{commercial}
versions, each with different functionalities.

The program editor in RStudio allows \textbf{autocompletion},
\textbf{automatic indentation}, \textbf{syntax highlighting},
\textbf{code folding} as well as \textbf{integrated help} and
information about functions and objects in the working environment.
There is the ability to view and edit the contents of variables and
datasets. To facilitate collaboration, scripts, data and other files can
be combined into projects (.Rproj) and versioned with \textbf{Git}.

    \hypertarget{gnu-octave-gui}{%
\paragraph{\texorpdfstring{\href{https://en.wikipedia.org/wiki/GNU_Octave\#User_interfaces}{GNU
Octave (GUI)}}{GNU Octave (GUI)}}\label{gnu-octave-gui}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_GNU_Octave.png}
\caption{Screenshot of IDE \emph{GNU Octave} (source: Kasper, license:
CC BY-SA 4.0)}
\end{figure}

It is the official graphical user interface for the \textbf{GNU Octave}
programming language and is available for Windows, macOS, Linux and BSD
under \textbf{Open Source} licensing.

If the command line interpreter (CLI) starts instead of the graphical
user interface (GUI) when \texttt{octave} is called, the GUI can be forced
via the \texttt{octave\ -\/-gui} option.

    \hypertarget{cloud-hosted-ides}{%
\subsubsection{Cloud-hosted IDEs}\label{cloud-hosted-ides}}

A very interesting alternative to owning local hardware resources that
are adequately powerful for the ML application, and thus expensive, can be
\textbf{cloud-hosted Jupyter environments}. These offer features such as
cloud storage, model training and deployment capabilities, version
control and much more.

Since the entire hardware and backend configurations are hosted in the
cloud by the various providers, the user can concentrate on creating his
ML application. The cloud provider takes care of purchasing the hardware
and the sometimes time-consuming installation and configuration of the
programming environment (\cite{Colab_Alternatives_2021}).

The cloud environments briefly presented here can be used freely after
registration - on condition that the user's own projects remain accessible to other
researchers. Even in the free variant, GPUs and
\href{https://en.wikipedia.org/wiki/Tensor_Processing_Unit}{Tensor
Processing Units (TPUs)} can be selected in the project for hardware
acceleration. This is particularly interesting for training deep neural
networks.

In the premium versions, for example, more powerful GPUs and TPUs as
well as more memory can be accessed. Additionally, there is the option
to keep the projects private and thus prevent accessibility for other
researchers.

However, with all the advantages, \textbf{data protection aspects}
should definitely be considered. Before using a cloud environment, it
should be clarified whether and to what extent, for example, \textbf{one's own
datasets with personal data} may be uploaded to the cloud projects. If
there are uncertainties here, local and self-hosted ML resources should
be used in any case!

    \hypertarget{google-colaboratory}{%
\paragraph{\texorpdfstring{\href{https://colab.research.google.com/}{Google
Colaboratory}}{Google Colaboratory}}\label{google-colaboratory}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_google_Colab.png}
\caption{Screenshot of IDE \emph{Google Colaboratory} (source: Kasper,
license: CC BY-SA 4.0)}
\end{figure}

In recent years, \textbf{Google Colaboratory} (\textbf{Colab} for short)
has become a popular choice for cloud-based Jupyter notebooks. Thanks to
its free-to-use GPUs and cloud storage linked to Google Drive, it is
used by many researchers in the ML and data science community
(\cite{Colab_Alternatives_2021}).

Due to the similarity of the web interface to Jupyter, Python developers
can write and run arbitrary Python code. Colab is a
cloud-hosted version of Jupyter Notebook that provides free access to
compute infrastructure such as memory, storage, processing capacity,
GPUs and TPUs (\cite{Colab_about_2022}).

Furthermore, commonly used libraries such as \textbf{PyTorch},
\textbf{TensorFlow} and \textbf{Keras} can be used to develop deep
learning applications (\cite{Colab_5_Alternatives_2021}).

    \hypertarget{google-kaggle}{%
\paragraph{\texorpdfstring{\href{https://www.kaggle.com}{Google
Kaggle}}{Google Kaggle}}\label{google-kaggle}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_google_Kaggle.png}
\caption{Screenshot of IDE \emph{Google Kaggle} (source: Kasper,
license: CC BY-SA 4.0)}
\end{figure}

This is another Google product with similar functionality to Colab. Like
Colab, \textbf{Kaggle} also offers free browser-based Jupyter notebooks
and the use of GPUs. Kaggle also has many \textbf{Python packages
pre-installed}, which lowers the barrier to entry for many users
(\cite{Colab_Alternatives_2021}).

Kaggle and Colab have a number of similarities - among other things,
most of the keyboard shortcuts are the same as in Jupyter notebooks.
Furthermore, many datasets can be imported. Kaggle has a large user
community to learn and improve data science skills
(\cite{Colab_5_Alternatives_2021}).

    \hypertarget{paperspace-gradient}{%
\paragraph{\texorpdfstring{\href{https://www.paperspace.com/gradient/notebooks}{Paperspace
Gradient}}{Paperspace Gradient}}\label{paperspace-gradient}}

\begin{figure}
\centering
\includegraphics{images/Screenshot_Paperspace_Gradient.png}
\caption{Screenshot of IDE \emph{Paperspace Gradient} (source: Kasper,
license: CC BY-SA 4.0)}
\end{figure}

Unlike Colab, \textbf{Paperspace Gradient} can implement entire
\textbf{ML workflows} from data pre-processing to training models to
deploying the trained models. Furthermore, Gradient has features like a
CLI tool, more control over the GPU and simpler data management
services. Due to the variety of functions, one must first become
familiar with the operation of the significantly more complex user
interface (\cite{Free_GPUs_for_ML_2020}).

    \hypertarget{operating-systems}{%
\subsubsection{Operating systems}\label{operating-systems}}

The \textbf{programming languages}, \textbf{Python libraries} and
\textbf{development environments} presented in the previous sections are
available for different operating systems, such as \textbf{Linux},
\textbf{Windows} and \textbf{macOS}. Therefore, the decision for or
against an operating system may \textbf{depend on the technical
background} of the ML developer.

Nevertheless, the following general \textbf{requirements} can be
specified for an operating system \textbf{suitable for software
development}:

\begin{itemize}
\tightlist
\item
  \textbf{Openness}: availability of very good interface documentation
  and ideally open source software
\item
  \textbf{Self-administration}: user has full installation and
  configuration rights
\item
  \textbf{Communication capability}: unfiltered and bidirectional
  communication within the local network as well as with the internet is
  possible on all necessary protocols
\item
  \textbf{Extensibility}:

  \begin{itemize}
  \tightlist
  \item
    automated software installation and update management via central
    package management systems such as \texttt{apt}, \texttt{pip} or
    \texttt{conda}
  \item
    possible integration of additional software libraries or external
    sensor hardware
  \end{itemize}
\end{itemize}

The following trend chart shows how the
\href{https://insights.stackoverflow.com/trends?tags=windows\%2Clinux\%2Cmacos}{popularity
of selected operating systems} used by \textbf{data analysts} and
\textbf{ML developers} has evolved since 2008:

\begin{figure}
\centering
\includegraphics{images/2022-09-07_StackOverflowTrends_OperatingSystems_wide.png}
\caption{Trend chart showing the popularity of selected operating systems used
by \textbf{data analysts} and \textbf{ML developers} (source:
\href{https://insights.stackoverflow.com/trends?tags=windows\%2Clinux\%2Cmacos}{Stack
Overflow Trends}, license: CC BY-SA 4.0)}
\end{figure}

For \textbf{security} reasons, the \textbf{IT departments} of many
employers massively \textbf{restrict installation and configuration
rights}. Furthermore, very restrictive firewall settings severely
\textbf{restrict} unfiltered and bidirectional \textbf{communication} in
the local network and to the internet. Automated \textbf{software
installations} via package managers are often \textbf{not possible} or
only possible with difficulty due to blocked protocols.

To deal with these challenges, two possible solutions are presented
below.

    \hypertarget{virtual-machine}{%
\paragraph{Virtual machine}\label{virtual-machine}}

To be able to install, configure and update the required software (IDEs,
programming languages and ML packages) independently, the use of a
\href{https://en.wikipedia.org/wiki/Virtual_machine}{Virtual Machine
(VM)} could be a possible alternative.

However, there are also significant disadvantages here:

\begin{itemize}
\tightlist
\item
  The \textbf{communication problem} is \textbf{not solved}, because the
  VM shares the access to the internet with the host system.
\item
  The \textbf{access to 3D graphics cards} is usually \textbf{not
  possible} due to virtualization.
\item
  This solution has only \textbf{low application performance}, as
  regular business computers are often only very sparsely equipped in
  terms of RAM and processor performance for cost reasons.
\end{itemize}

    \hypertarget{separate-lab-computer}{%
\paragraph{Separate lab computer}\label{separate-lab-computer}}

All the problems mentioned in the previous section can only be solved
satisfactorily by acquiring a \textbf{separate laboratory computer} with
\textbf{its own internet access} (e.g.~via an \textbf{LTE-capable wifi
router}).

This laboratory computer can be configured according to your own
requirements, depending on the available budget in terms of hardware and
software.

However, it should be noted here that the \textbf{IT departments} of
many employers do \textbf{not offer any support} for this solution. You
are usually responsible for software installation, maintenance and
troubleshooting yourself!

    \hypertarget{step-1-acquire-the-ml-dataset}{%
\section{STEP 1: Acquire the ML
dataset}\label{step-1-acquire-the-ml-dataset}}

To allow an ML novice to first familiarize themselves with the ML
algorithms, tools, libraries, and programming systems, the ready-made
and very beginner-friendly \textbf{Iris dataset} is used in this
step. Only after a comprehensive acquaintance with the application of ML
tools would it make sense to examine one's own environment for
ML-suitable applications and to obtain suitable datasets from them.
However, this is beyond the scope of this introductory tutorial.

Further details, for example on the history of the creation of the
\href{https://en.wikipedia.org/wiki/Iris_flower_data_set}{Iris flower
dataset}, can be found on Wikipedia (see \cite{Wiki_IrisDS}).

The dataset can be downloaded from
\href{https://www.kaggle.com/datasets/arshid/iris-flower-dataset}{Kaggle:
Iris Flower Dataset} (\cite{Kaggle_IrisDS}). Furthermore, the dataset is
available via Python in the machine learning package
\href{https://scikit-learn.org}{Scikit-learn}, so that users can access
it without having to find a special source for it.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{2}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import Iris dataset for exploration}
\PY{n}{irisdata\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{step-2-explore-the-ml-dataset}{%
\section{STEP 2: Explore the ML
dataset}\label{step-2-explore-the-ml-dataset}}

One of the most important steps in the entire ML process is this step,
in which the dataset included in Step 1 is examined using typical data
analysis tools. In addition to exploring the \textbf{data structure} and
\textbf{internal correlations} in the dataset, errors such as
\textbf{gaps}, \textbf{duplications}, or obvious \textbf{misentries}
must also be found and corrected where possible. This is enormously
important so that the classification can later provide plausible
results.

\hypertarget{goals-of-exploration}{%
\subsection{Goals of exploration}\label{goals-of-exploration}}

The objectives of the exploration of the dataset are as follows:

\begin{enumerate}
\def\labelenumi{\arabic{enumi}.}
\tightlist
\item
  Clarify the \textbf{origins history}:

  \begin{itemize}
  \tightlist
  \item
    Where did the data come from? \textbf{→} Contact persons and
    licensing permissions?
  \item
    Who obtained the data and with which (measurement) methods?
    \textbf{→} Did systematic errors occur during the acquisition?
  \item
    What were they originally intended for? \textbf{→} Can they be used
    for my application?
  \end{itemize}
\item
  Overview of the internal \textbf{structure and organization} of the
  data:

  \begin{itemize}
  \tightlist
  \item
    Which columns are there? \textbf{→} With which methods can they be
    read in (e.g.~import of CSV files)?
  \item
    Which (physical) measured variables do they contain? \textbf{→}
    Which technical or physical correlations exist?
  \item
    Which data formats or types are there? \textbf{→} Do they have to be
    converted?
  \item
    In which value ranges do the measurement data vary? \textbf{→} Are
    normalizations necessary?
  \end{itemize}
\item
  Identify \textbf{anomalies} in the dataset:

  \begin{itemize}
  \tightlist
  \item
    Do the data have \textbf{gaps} or \textbf{duplicates}? \textbf{→}
    Does the dataset need to be cleaned?
  \item
    Are there obvious erroneous entries or measurement outliers?
    \textbf{→} Does (statistical) filtering have to be carried out?
  \end{itemize}
\item
  Avoidance of \textbf{tendencies due to bias}:

  \begin{itemize}
  \tightlist
  \item
    Are all possible classes included in the dataset and equally
    distributed? \textbf{→} Does the dataset need to be enriched with
    additional data for balance?
  \end{itemize}
\item
  Find a first rough \textbf{idea of which correlations} could be in the
  dataset
\end{enumerate}

    \hypertarget{clarify-the-origins-history}{%
\subsection{\texorpdfstring{Clarify the \textbf{origins
history}}{Clarify the origins history}}\label{clarify-the-origins-history}}

\begin{quote}
The
\textbf{\href{https://en.wikipedia.org/wiki/Iris_flower_data_set}{Iris
flower dataset}} is a multivariate dataset introduced by the British
statistician and biologist \emph{Ronald Fisher} in his paper
``\href{https://onlinelibrary.wiley.com/doi/10.1111/j.1469-1809.1936.tb02137.x}{The
use of multiple measurements in taxonomic problems}''
(\cite{Fisher_1936}). It is sometimes called \emph{Anderson's Iris
dataset} because Edgar Anderson collected the data to quantify the
morphologic variation of Iris flowers of three related species
(\cite{Wiki_IrisDS}).
\end{quote}

The dataset has been released into the public domain under a
\href{https://creativecommons.org/share-your-work/public-domain/cc0/}{CC0 license}.

This dataset became a typical test case for many statistical
classification techniques in machine learning such as \textbf{support
vector machines}.

\begin{quote}
{[}..{]} measurements of the flowers of fifty plants each of the two
species \emph{Iris setosa} and \emph{I. versicolor}, found
\textbf{growing together in the same colony} and measured by Dr E.
Anderson (\cite{Fisher_1936})
\end{quote}

\begin{quote}
{[}..{]} \emph{Iris virginica}, differs from the two other samples in
\textbf{not being taken from the same natural colony} (ibidem)
\end{quote}

    \hypertarget{overview-of-the-internal-structure-and-organization-of-the-data}{%
\subsection{\texorpdfstring{Overview of the internal \textbf{structure
and organization} of the
data}{Overview of the internal structure and organization of the data}}\label{overview-of-the-internal-structure-and-organization-of-the-data}}

The dataset consists of 50 samples from each of three species of Iris:
\href{https://en.wikipedia.org/wiki/Iris_setosa}{\emph{Iris setosa}},
\href{https://en.wikipedia.org/wiki/Iris_virginica}{\emph{Iris
virginica}} and
\href{https://en.wikipedia.org/wiki/Iris_versicolor}{\emph{Iris
versicolor}}, so there are 150 samples in total
(\cite{Wiki_Iris_setosa}, \cite{Wiki_Iris_virginica} and
\cite{Wiki_Iris_versicolor}).

Four features were measured from each sample: the length and the width
of the \textbf{\href{https://en.wikipedia.org/wiki/Sepal}{sepals}} and
\textbf{\href{https://en.wikipedia.org/wiki/Petal}{petals}}, in
centimetres (\cite{Wiki_Sepal} and \cite{Wiki_Petal}). Here you can see
a schematic illustration of a flower in which, among other things, the
sepals and petals are shown:

    \begin{figure}
\centering
\includegraphics{images/Mature_flower_diagram_1024px.png}
\caption{Schematic illustration of a flower with sepal and petal
(source:
\href{https://en.wikipedia.org/wiki/File:Mature_flower_diagram.svg}{Mature\_flower\_diagram.svg},
license: public domain)}
\end{figure}

    Here are pictures of the three different Iris species (\emph{Iris
setosa}, \emph{Iris virginica} and \emph{Iris versicolor}). Given the
dimensions of the flower, it will be possible to predict the class of
the flower.

    \begin{figure}
\centering
\includegraphics{images/Iris_images.png}
\caption{Left: \emph{Iris setosa} (source:
\href{https://commons.wikimedia.org/wiki/File:Irissetosa1.jpg}{Irissetosa1.jpg},
license: public domain); middle: \emph{Iris versicolor} (source:
\href{https://en.wikipedia.org/wiki/File:Iris_versicolor_3.jpg}{Iris\_versicolor\_3.jpg},
license: CC SA 3.0); right: \emph{Iris virginica} (source:
\href{https://en.wikipedia.org/wiki/File:Iris_virginica.jpg}{Iris\_virginica.jpg},
license: CC SA 2.0)}
\end{figure}

    \hypertarget{inspect-structure-of-dataframe}{%
\subsubsection{\texorpdfstring{Inspect \textbf{structure of
dataframe}}{Inspect structure of dataframe}}\label{inspect-structure-of-dataframe}}

    When a dataframe is printed (without rendering), only an abbreviated
view of the dataframe is shown.\\
By default, the \texttt{Pandas} library displays only the first 5 and the
last 5 rows.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{3}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n+nb}{print}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
     sepal\_length  sepal\_width  petal\_length  petal\_width         species
0             5.1          3.5           1.4          0.2     Iris-setosa
1             4.9          3.0           1.4          0.2     Iris-setosa
2             4.7          3.2           1.3          0.2     Iris-setosa
3             4.6          3.1           1.5          0.2     Iris-setosa
4             5.0          3.6           1.4          0.2     Iris-setosa
..            {\ldots}          {\ldots}           {\ldots}          {\ldots}             {\ldots}
145           6.7          3.0           5.2          2.3  Iris-virginica
146           6.3          2.5           5.0          1.9  Iris-virginica
147           6.5          3.0           5.2          2.0  Iris-virginica
148           6.2          3.4           5.4          2.3  Iris-virginica
149           5.9          3.0           5.1          1.8  Iris-virginica

[150 rows x 5 columns]
    \end{Verbatim}

    To print all rows of a dataframe, the option \texttt{display.max\_rows}
has to be set to \texttt{None} in the \texttt{Pandas} options.

Alternatively, if the dataframe is rendered with a function from the
\texttt{IPython.display} package, all rows of the dataframe are
displayed by default. \textbf{Rendering dataframes as markdown tables}
is handled here by the custom function
\texttt{func\_render\_dataframe2Markdown()}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{4}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{}pd.set\PYZus{}option(\PYZsq{}display.max\PYZus{}rows\PYZsq{}, None)}
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get all rows of Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0595}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1905}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1786}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1905}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1786}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.2024}}@{}}
\caption{Get all rows of Iris dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 5.1 & 3.5 & 1.4 & 0.2 & Iris-setosa \\
1 & 4.9 & 3 & 1.4 & 0.2 & Iris-setosa \\
2 & 4.7 & 3.2 & 1.3 & 0.2 & Iris-setosa \\
3 & 4.6 & 3.1 & 1.5 & 0.2 & Iris-setosa \\
4 & 5 & 3.6 & 1.4 & 0.2 & Iris-setosa \\
5 & 5.4 & 3.9 & 1.7 & 0.4 & Iris-setosa \\
6 & 4.6 & 3.4 & 1.4 & 0.3 & Iris-setosa \\
7 & 5 & 3.4 & 1.5 & 0.2 & Iris-setosa \\
8 & 4.4 & 2.9 & 1.4 & 0.2 & Iris-setosa \\
9 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
10 & 5.4 & 3.7 & 1.5 & 0.2 & Iris-setosa \\
11 & 4.8 & 3.4 & 1.6 & 0.2 & Iris-setosa \\
12 & 4.8 & 3 & 1.4 & 0.1 & Iris-setosa \\
13 & 4.3 & 3 & 1.1 & 0.1 & Iris-setosa \\
14 & 5.8 & 4 & 1.2 & 0.2 & Iris-setosa \\
15 & 5.7 & 4.4 & 1.5 & 0.4 & Iris-setosa \\
16 & 5.4 & 3.9 & 1.3 & 0.4 & Iris-setosa \\
17 & 5.1 & 3.5 & 1.4 & 0.3 & Iris-setosa \\
18 & 5.7 & 3.8 & 1.7 & 0.3 & Iris-setosa \\
19 & 5.1 & 3.8 & 1.5 & 0.3 & Iris-setosa \\
20 & 5.4 & 3.4 & 1.7 & 0.2 & Iris-setosa \\
21 & 5.1 & 3.7 & 1.5 & 0.4 & Iris-setosa \\
22 & 4.6 & 3.6 & 1 & 0.2 & Iris-setosa \\
23 & 5.1 & 3.3 & 1.7 & 0.5 & Iris-setosa \\
24 & 4.8 & 3.4 & 1.9 & 0.2 & Iris-setosa \\
25 & 5 & 3 & 1.6 & 0.2 & Iris-setosa \\
26 & 5 & 3.4 & 1.6 & 0.4 & Iris-setosa \\
27 & 5.2 & 3.5 & 1.5 & 0.2 & Iris-setosa \\
28 & 5.2 & 3.4 & 1.4 & 0.2 & Iris-setosa \\
29 & 4.7 & 3.2 & 1.6 & 0.2 & Iris-setosa \\
30 & 4.8 & 3.1 & 1.6 & 0.2 & Iris-setosa \\
31 & 5.4 & 3.4 & 1.5 & 0.4 & Iris-setosa \\
32 & 5.2 & 4.1 & 1.5 & 0.1 & Iris-setosa \\
33 & 5.5 & 4.2 & 1.4 & 0.2 & Iris-setosa \\
34 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
35 & 5 & 3.2 & 1.2 & 0.2 & Iris-setosa \\
36 & 5.5 & 3.5 & 1.3 & 0.2 & Iris-setosa \\
37 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
38 & 4.4 & 3 & 1.3 & 0.2 & Iris-setosa \\
39 & 5.1 & 3.4 & 1.5 & 0.2 & Iris-setosa \\
40 & 5 & 3.5 & 1.3 & 0.3 & Iris-setosa \\
41 & 4.5 & 2.3 & 1.3 & 0.3 & Iris-setosa \\
42 & 4.4 & 3.2 & 1.3 & 0.2 & Iris-setosa \\
43 & 5 & 3.5 & 1.6 & 0.6 & Iris-setosa \\
44 & 5.1 & 3.8 & 1.9 & 0.4 & Iris-setosa \\
45 & 4.8 & 3 & 1.4 & 0.3 & Iris-setosa \\
46 & 5.1 & 3.8 & 1.6 & 0.2 & Iris-setosa \\
47 & 4.6 & 3.2 & 1.4 & 0.2 & Iris-setosa \\
48 & 5.3 & 3.7 & 1.5 & 0.2 & Iris-setosa \\
49 & 5 & 3.3 & 1.4 & 0.2 & Iris-setosa \\
50 & 7 & 3.2 & 4.7 & 1.4 & Iris-versicolor \\
51 & 6.4 & 3.2 & 4.5 & 1.5 & Iris-versicolor \\
52 & 6.9 & 3.1 & 4.9 & 1.5 & Iris-versicolor \\
53 & 5.5 & 2.3 & 4 & 1.3 & Iris-versicolor \\
54 & 6.5 & 2.8 & 4.6 & 1.5 & Iris-versicolor \\
55 & 5.7 & 2.8 & 4.5 & 1.3 & Iris-versicolor \\
56 & 6.3 & 3.3 & 4.7 & 1.6 & Iris-versicolor \\
57 & 4.9 & 2.4 & 3.3 & 1 & Iris-versicolor \\
58 & 6.6 & 2.9 & 4.6 & 1.3 & Iris-versicolor \\
59 & 5.2 & 2.7 & 3.9 & 1.4 & Iris-versicolor \\
60 & 5 & 2 & 3.5 & 1 & Iris-versicolor \\
61 & 5.9 & 3 & 4.2 & 1.5 & Iris-versicolor \\
62 & 6 & 2.2 & 4 & 1 & Iris-versicolor \\
63 & 6.1 & 2.9 & 4.7 & 1.4 & Iris-versicolor \\
64 & 5.6 & 2.9 & 3.6 & 1.3 & Iris-versicolor \\
65 & 6.7 & 3.1 & 4.4 & 1.4 & Iris-versicolor \\
66 & 5.6 & 3 & 4.5 & 1.5 & Iris-versicolor \\
67 & 5.8 & 2.7 & 4.1 & 1 & Iris-versicolor \\
68 & 6.2 & 2.2 & 4.5 & 1.5 & Iris-versicolor \\
69 & 5.6 & 2.5 & 3.9 & 1.1 & Iris-versicolor \\
70 & 5.9 & 3.2 & 4.8 & 1.8 & Iris-versicolor \\
71 & 6.1 & 2.8 & 4 & 1.3 & Iris-versicolor \\
72 & 6.3 & 2.5 & 4.9 & 1.5 & Iris-versicolor \\
73 & 6.1 & 2.8 & 4.7 & 1.2 & Iris-versicolor \\
74 & 6.4 & 2.9 & 4.3 & 1.3 & Iris-versicolor \\
75 & 6.6 & 3 & 4.4 & 1.4 & Iris-versicolor \\
76 & 6.8 & 2.8 & 4.8 & 1.4 & Iris-versicolor \\
77 & 6.7 & 3 & 5 & 1.7 & Iris-versicolor \\
78 & 6 & 2.9 & 4.5 & 1.5 & Iris-versicolor \\
79 & 5.7 & 2.6 & 3.5 & 1 & Iris-versicolor \\
80 & 5.5 & 2.4 & 3.8 & 1.1 & Iris-versicolor \\
81 & 5.5 & 2.4 & 3.7 & 1 & Iris-versicolor \\
82 & 5.8 & 2.7 & 3.9 & 1.2 & Iris-versicolor \\
83 & 6 & 2.7 & 5.1 & 1.6 & Iris-versicolor \\
84 & 5.4 & 3 & 4.5 & 1.5 & Iris-versicolor \\
85 & 6 & 3.4 & 4.5 & 1.6 & Iris-versicolor \\
86 & 6.7 & 3.1 & 4.7 & 1.5 & Iris-versicolor \\
87 & 6.3 & 2.3 & 4.4 & 1.3 & Iris-versicolor \\
88 & 5.6 & 3 & 4.1 & 1.3 & Iris-versicolor \\
89 & 5.5 & 2.5 & 4 & 1.3 & Iris-versicolor \\
90 & 5.5 & 2.6 & 4.4 & 1.2 & Iris-versicolor \\
91 & 6.1 & 3 & 4.6 & 1.4 & Iris-versicolor \\
92 & 5.8 & 2.6 & 4 & 1.2 & Iris-versicolor \\
93 & 5 & 2.3 & 3.3 & 1 & Iris-versicolor \\
94 & 5.6 & 2.7 & 4.2 & 1.3 & Iris-versicolor \\
95 & 5.7 & 3 & 4.2 & 1.2 & Iris-versicolor \\
96 & 5.7 & 2.9 & 4.2 & 1.3 & Iris-versicolor \\
97 & 6.2 & 2.9 & 4.3 & 1.3 & Iris-versicolor \\
98 & 5.1 & 2.5 & 3 & 1.1 & Iris-versicolor \\
99 & 5.7 & 2.8 & 4.1 & 1.3 & Iris-versicolor \\
100 & 6.3 & 3.3 & 6 & 2.5 & Iris-virginica \\
101 & 5.8 & 2.7 & 5.1 & 1.9 & Iris-virginica \\
102 & 7.1 & 3 & 5.9 & 2.1 & Iris-virginica \\
103 & 6.3 & 2.9 & 5.6 & 1.8 & Iris-virginica \\
104 & 6.5 & 3 & 5.8 & 2.2 & Iris-virginica \\
105 & 7.6 & 3 & 6.6 & 2.1 & Iris-virginica \\
106 & 4.9 & 2.5 & 4.5 & 1.7 & Iris-virginica \\
107 & 7.3 & 2.9 & 6.3 & 1.8 & Iris-virginica \\
108 & 6.7 & 2.5 & 5.8 & 1.8 & Iris-virginica \\
109 & 7.2 & 3.6 & 6.1 & 2.5 & Iris-virginica \\
110 & 6.5 & 3.2 & 5.1 & 2 & Iris-virginica \\
111 & 6.4 & 2.7 & 5.3 & 1.9 & Iris-virginica \\
112 & 6.8 & 3 & 5.5 & 2.1 & Iris-virginica \\
113 & 5.7 & 2.5 & 5 & 2 & Iris-virginica \\
114 & 5.8 & 2.8 & 5.1 & 2.4 & Iris-virginica \\
115 & 6.4 & 3.2 & 5.3 & 2.3 & Iris-virginica \\
116 & 6.5 & 3 & 5.5 & 1.8 & Iris-virginica \\
117 & 7.7 & 3.8 & 6.7 & 2.2 & Iris-virginica \\
118 & 7.7 & 2.6 & 6.9 & 2.3 & Iris-virginica \\
119 & 6 & 2.2 & 5 & 1.5 & Iris-virginica \\
120 & 6.9 & 3.2 & 5.7 & 2.3 & Iris-virginica \\
121 & 5.6 & 2.8 & 4.9 & 2 & Iris-virginica \\
122 & 7.7 & 2.8 & 6.7 & 2 & Iris-virginica \\
123 & 6.3 & 2.7 & 4.9 & 1.8 & Iris-virginica \\
124 & 6.7 & 3.3 & 5.7 & 2.1 & Iris-virginica \\
125 & 7.2 & 3.2 & 6 & 1.8 & Iris-virginica \\
126 & 6.2 & 2.8 & 4.8 & 1.8 & Iris-virginica \\
127 & 6.1 & 3 & 4.9 & 1.8 & Iris-virginica \\
128 & 6.4 & 2.8 & 5.6 & 2.1 & Iris-virginica \\
129 & 7.2 & 3 & 5.8 & 1.6 & Iris-virginica \\
130 & 7.4 & 2.8 & 6.1 & 1.9 & Iris-virginica \\
131 & 7.9 & 3.8 & 6.4 & 2 & Iris-virginica \\
132 & 6.4 & 2.8 & 5.6 & 2.2 & Iris-virginica \\
133 & 6.3 & 2.8 & 5.1 & 1.5 & Iris-virginica \\
134 & 6.1 & 2.6 & 5.6 & 1.4 & Iris-virginica \\
135 & 7.7 & 3 & 6.1 & 2.3 & Iris-virginica \\
136 & 6.3 & 3.4 & 5.6 & 2.4 & Iris-virginica \\
137 & 6.4 & 3.1 & 5.5 & 1.8 & Iris-virginica \\
138 & 6 & 3 & 4.8 & 1.8 & Iris-virginica \\
139 & 6.9 & 3.1 & 5.4 & 2.1 & Iris-virginica \\
140 & 6.7 & 3.1 & 5.6 & 2.4 & Iris-virginica \\
141 & 6.9 & 3.1 & 5.1 & 2.3 & Iris-virginica \\
142 & 5.8 & 2.7 & 5.1 & 1.9 & Iris-virginica \\
143 & 6.8 & 3.2 & 5.9 & 2.3 & Iris-virginica \\
144 & 6.7 & 3.3 & 5.7 & 2.5 & Iris-virginica \\
145 & 6.7 & 3 & 5.2 & 2.3 & Iris-virginica \\
146 & 6.3 & 2.5 & 5 & 1.9 & Iris-virginica \\
147 & 6.5 & 3 & 5.2 & 2 & Iris-virginica \\
148 & 6.2 & 3.4 & 5.4 & 2.3 & Iris-virginica \\
149 & 5.9 & 3 & 5.1 & 1.8 & Iris-virginica \\
\end{longtable}

    
    Print the first or last 10 rows of the rendered dataframe:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{5}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get first 10 rows of Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{head}\PY{p}{(}\PY{l+m+mi}{10}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0506}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.2025}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1899}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.2025}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1899}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1646}}@{}}
\caption{Get first 10 rows of Iris dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 5.1 & 3.5 & 1.4 & 0.2 & Iris-setosa \\
1 & 4.9 & 3 & 1.4 & 0.2 & Iris-setosa \\
2 & 4.7 & 3.2 & 1.3 & 0.2 & Iris-setosa \\
3 & 4.6 & 3.1 & 1.5 & 0.2 & Iris-setosa \\
4 & 5 & 3.6 & 1.4 & 0.2 & Iris-setosa \\
5 & 5.4 & 3.9 & 1.7 & 0.4 & Iris-setosa \\
6 & 4.6 & 3.4 & 1.4 & 0.3 & Iris-setosa \\
7 & 5 & 3.4 & 1.5 & 0.2 & Iris-setosa \\
8 & 4.4 & 2.9 & 1.4 & 0.2 & Iris-setosa \\
9 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
\end{longtable}

    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{6}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get last 10 rows of Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{tail}\PY{p}{(}\PY{l+m+mi}{10}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0602}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}@{}}
\caption{Get last 10 rows of Iris dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
140 & 6.7 & 3.1 & 5.6 & 2.4 & Iris-virginica \\
141 & 6.9 & 3.1 & 5.1 & 2.3 & Iris-virginica \\
142 & 5.8 & 2.7 & 5.1 & 1.9 & Iris-virginica \\
143 & 6.8 & 3.2 & 5.9 & 2.3 & Iris-virginica \\
144 & 6.7 & 3.3 & 5.7 & 2.5 & Iris-virginica \\
145 & 6.7 & 3 & 5.2 & 2.3 & Iris-virginica \\
146 & 6.3 & 2.5 & 5 & 1.9 & Iris-virginica \\
147 & 6.5 & 3 & 5.2 & 2 & Iris-virginica \\
148 & 6.2 & 3.4 & 5.4 & 2.3 & Iris-virginica \\
149 & 5.9 & 3 & 5.1 & 1.8 & Iris-virginica \\
\end{longtable}

    
    To output the \textbf{head} and the \textbf{tail} of the dataframe
\textbf{simultaneously}, the Pandas API was extended with the function
\texttt{head\_tail(rows)} directly after the
\hyperref[import-python-packages-globally]{import of the Pandas library}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{7}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get head and tail of Iris dataframe simultaneously}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0602}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}@{}}
\caption{Get head and tail of Iris dataframe
simultaneously}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 5.1 & 3.5 & 1.4 & 0.2 & Iris-setosa \\
1 & 4.9 & 3 & 1.4 & 0.2 & Iris-setosa \\
2 & 4.7 & 3.2 & 1.3 & 0.2 & Iris-setosa \\
3 & 4.6 & 3.1 & 1.5 & 0.2 & Iris-setosa \\
4 & 5 & 3.6 & 1.4 & 0.2 & Iris-setosa \\
145 & 6.7 & 3 & 5.2 & 2.3 & Iris-virginica \\
146 & 6.3 & 2.5 & 5 & 1.9 & Iris-virginica \\
147 & 6.5 & 3 & 5.2 & 2 & Iris-virginica \\
148 & 6.2 & 3.4 & 5.4 & 2.3 & Iris-virginica \\
149 & 5.9 & 3 & 5.1 & 1.8 & Iris-virginica \\
\end{longtable}

    
    \hypertarget{get-data-types-and-basic-statistical-data}{%
\subsubsection{Get data types and basic statistical
data}\label{get-data-types-and-basic-statistical-data}}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{8}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{info}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 \#   Column        Non-Null Count  Dtype
---  ------        --------------  -----
 0   sepal\_length  150 non-null    float64
 1   sepal\_width   150 non-null    float64
 2   petal\_length  150 non-null    float64
 3   petal\_width   150 non-null    float64
 4   species       150 non-null    object
dtypes: float64(4), object(1)
memory usage: 6.0+ KB
    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{9}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get some basic statistical data of Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{describe}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1014}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}@{}}
\caption{Get some basic statistical data of Iris
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
count & 150 & 150 & 150 & 150 \\
mean & 5.84333 & 3.054 & 3.75867 & 1.19867 \\
std & 0.828066 & 0.433594 & 1.76442 & 0.763161 \\
min & 4.3 & 2 & 1 & 0.1 \\
25\% & 5.1 & 2.8 & 1.6 & 0.3 \\
50\% & 5.8 & 3 & 4.35 & 1.3 \\
75\% & 6.4 & 3.3 & 5.1 & 1.8 \\
max & 7.9 & 4.4 & 6.9 & 2.5 \\
\end{longtable}

    
    \hypertarget{get-data-ranges-and-distribution}{%
\subsubsection{Get data ranges and
distribution}\label{get-data-ranges-and-distribution}}

    \hypertarget{histograms}{%
\paragraph{Histograms}\label{histograms}}

This type of visualization is useful to explore the \textbf{frequency
distribution} for each feature in univariate plots. This requires the
separation of the data into classes (so-called \textbf{bins}). These
bins are represented in the histogram as rectangles of equal or variable
width. The height of each rectangle then represents the (relative or
absolute) \textbf{frequency density}, that is, the (relative or
absolute) \textbf{frequency divided by the width} of the corresponding
\textbf{class}.

Each \textbf{feature} of the \textbf{Iris dataset} is displayed in its
own histogram.

To illustrate the principle, the histogram subplots are first presented
in a \textbf{not very elegant code} with many repetitions:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{10}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Number of bins for the histogram}
\PY{n}{n\PYZus{}bins} \PY{o}{=} \PY{l+m+mi}{10}
\PY{n}{fig}\PY{p}{,} \PY{n}{axs} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} 
              \PY{n}{density}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Show grid}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Hide grid behind the bars}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (absolute)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} 
              \PY{n}{density}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Show grid}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Hide grid behind the bars}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (absolute)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} 
              \PY{n}{density}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Show grid}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Hide grid behind the bars}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (absolute)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} 
              \PY{n}{density}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Show grid}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Hide grid behind the bars}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{axs}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,}\PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (absolute)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{fig}\PY{o}{.}\PY{n}{savefig}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./images/Iris\PYZus{}histograms.png}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{dpi}\PY{o}{=}\PY{l+m+mi}{150}\PY{p}{,} \PY{n}{pad\PYZus{}inches}\PY{o}{=}\PY{l+m+mi}{5}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_56_0.png}\end{center}
        \caption{Histograms used to explore the absolute frequency distribution of the 4 features in the Iris dataset}
        \label{fig:histogram_iris_simple}
    \end{figure}
    
    To improve the code, the method \texttt{subplots.flatten()} collapses
the two-dimensional subplot array into a one-dimensional array. A loop
can then iterate over the subplots one by one---this \textbf{saves many
repetitions} in the code.

In addition, the \textbf{probability density function (PDF)} of a normal
distribution is overlaid on each histogram. Its parameters \textbf{mean}
and \textbf{standard deviation} are estimated beforehand from the
corresponding feature of the dataset. This makes it possible to judge
visually whether the \textbf{data is normally distributed}. To be able
to reuse the code later, it is implemented as the \textbf{function}
\texttt{func\_plot\_histograms\_with\_PDF()}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{11}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{scipy}\PY{n+nn}{.}\PY{n+nn}{stats} \PY{k+kn}{import} \PY{n}{norm}

\PY{k}{def} \PY{n+nf}{func\PYZus{}plot\PYZus{}histograms\PYZus{}with\PYZus{}PDF}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Number of bins for the histogram}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}integer\PYZgt{}: defines the number of equal\PYZhy{}width bins in the range}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}string\PYZgt{}: one of the binning strategies is used:}
    \PY{c+c1}{\PYZsh{}   \PYZsq{}auto\PYZsq{}, \PYZsq{}fd\PYZsq{}, \PYZsq{}doane\PYZsq{}, \PYZsq{}scott\PYZsq{}, \PYZsq{}stone\PYZsq{}, \PYZsq{}rice\PYZsq{}, \PYZsq{}sturges\PYZsq{}, or \PYZsq{}sqrt\PYZsq{}}
    \PY{n}{n\PYZus{}bins} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{auto}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set margins between subplots}
    \PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.35}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
    \PY{k}{for} \PY{n}{feature}\PY{p}{,} \PY{n}{title}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{df}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,}
                     \PY{n}{density}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Fit a normal distribution to the data}
        \PY{c+c1}{\PYZsh{} with mean and standard deviation}
        \PY{n}{mu}\PY{p}{,} \PY{n}{std} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{df}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Plot the probability density function (PDF)}
        \PY{n}{xmin}\PY{p}{,} \PY{n}{xmax} \PY{o}{=} \PY{n}{subplot}\PY{o}{.}\PY{n}{get\PYZus{}xlim}\PY{p}{(}\PY{p}{)}
        \PY{n}{x} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linspace}\PY{p}{(}\PY{n}{xmin}\PY{p}{,} \PY{n}{xmax}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{)}
        \PY{n}{p} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{pdf}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{plot}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{p}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{k}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{linewidth}\PY{o}{=}\PY{l+m+mi}{2}\PY{p}{)}

        \PY{n}{title\PYZus{}concat} \PY{o}{=} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{ (Mean: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{, }\PY{l+s+s2}{\PYZdq{}} \PYZbs{}
                       \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Std. deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{)}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Set the title of the histogram}
        \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the histogram}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{n}{title\PYZus{}concat}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Show grid}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (relative)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        
        \PY{n}{fig}\PY{o}{.}\PY{n}{savefig}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./images/Iris\PYZus{}histograms\PYZus{}pdf.png}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{dpi}\PY{o}{=}\PY{l+m+mi}{150}\PY{p}{,} \PY{n}{pad\PYZus{}inches}\PY{o}{=}\PY{l+m+mi}{5}\PY{p}{)}

    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Call the new function to plot the \textbf{histograms} with overlaid
\textbf{probability density functions}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{12}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{features} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{titles} \PY{o}{=}   \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{func\PYZus{}plot\PYZus{}histograms\PYZus{}with\PYZus{}PDF}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_60_0.png}\end{center}
        \caption{Histograms used to explore the relative frequency distribution of the 4 features in the Iris dataset (with improved code and overlaid probability density functions (PDF))}
        \label{fig:histogram_iris_with_PDF}
    \end{figure}
    
    \hypertarget{boxplots}{%
\paragraph{Boxplots}\label{boxplots}}

This type of visualization can be used to explore the \textbf{data
ranges} in the dataset. \textbf{Boxplots} also provide information about
\textbf{outliers}.

In the following code example, each of the 4 features of the Iris
dataset is displayed in its own subplot, with one box per species placed
side by side. As in the previous histogram example, a loop is used to
iterate through the subplots, which saves a lot of repetition in the
code.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{13}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.35}\PY{p}{)}

\PY{n}{class\PYZus{}names} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}setosa}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}versicolor}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}virginica}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{features} \PY{o}{=}    \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{feature}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{features}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} x, y: names of features in dataset}
    \PY{c+c1}{\PYZsh{} data: dataset for plotting}
    \PY{c+c1}{\PYZsh{} order: order to plot the class names in}
    \PY{c+c1}{\PYZsh{} ax: assignment of the plot to the subplot}
    \PY{n}{sns}\PY{o}{.}\PY{n}{boxplot}\PY{p}{(}\PY{n}{x} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y} \PY{o}{=} \PY{n}{feature}\PY{p}{,} 
                \PY{n}{data} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{order} \PY{o}{=} \PY{n}{class\PYZus{}names}\PY{p}{,} \PY{n}{ax} \PY{o}{=} \PY{n}{subplot}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Show grid}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{axis}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{y}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set the title of the boxplot}
    \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the boxplot}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Feature: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{feature}\PY{p}{)}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Label y\PYZhy{}axis}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    
    \PY{n}{fig}\PY{o}{.}\PY{n}{savefig}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./images/Iris\PYZus{}boxplots.png}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{dpi}\PY{o}{=}\PY{l+m+mi}{150}\PY{p}{,} \PY{n}{pad\PYZus{}inches}\PY{o}{=}\PY{l+m+mi}{5}\PY{p}{)}
    
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_62_0.png}\end{center}
        \caption{Boxplots used to explore the data ranges in the Iris dataset}
        \label{fig:boxplots_iris}
    \end{figure}
    
    \hypertarget{violin-plots}{%
\paragraph{Violin plots}\label{violin-plots}}

Another type of visualization is the \textbf{violin plot}, which
\textbf{combines} the advantages of both the \textbf{histogram} and the
\textbf{box plot}. As in the two previous examples, a loop is used to
iterate through the subplots, which saves a lot of repetition in the
code.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{14}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.35}\PY{p}{)}

\PY{n}{class\PYZus{}names} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}setosa}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}versicolor}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Iris\PYZhy{}virginica}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{features} \PY{o}{=}    \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{feature}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{features}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} x, y: names of features in dataset}
    \PY{c+c1}{\PYZsh{} data: dataset for plotting}
    \PY{c+c1}{\PYZsh{} order: order to plot the class names in}
    \PY{c+c1}{\PYZsh{} ax: assignment of the plot to the subplot}
    \PY{n}{sns}\PY{o}{.}\PY{n}{violinplot}\PY{p}{(}\PY{n}{x} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y} \PY{o}{=} \PY{n}{feature}\PY{p}{,} 
                   \PY{n}{data} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{order} \PY{o}{=} \PY{n}{class\PYZus{}names}\PY{p}{,} \PY{n}{ax} \PY{o}{=} \PY{n}{subplot}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Show grid}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{axis}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{y}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set the title of the violin plot}
    \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the violin plot}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Feature: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{feature}\PY{p}{)}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Label y\PYZhy{}axis}
    \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    
    \PY{n}{fig}\PY{o}{.}\PY{n}{savefig}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./images/Iris\PYZus{}violinplots.png}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{dpi}\PY{o}{=}\PY{l+m+mi}{150}\PY{p}{,} \PY{n}{pad\PYZus{}inches}\PY{o}{=}\PY{l+m+mi}{5}\PY{p}{)}
    
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_64_0.png}\end{center}
        \caption{Violin plots combine histograms and box plots}
        \label{fig:violinplots_iris}
    \end{figure}
    
    \hypertarget{identify-anomalies-in-the-datasets}{%
\subsection{\texorpdfstring{Identify \textbf{anomalies} in the
datasets}{Identify anomalies in the datasets}}\label{identify-anomalies-in-the-datasets}}

\hypertarget{find-and-repair-gaps-in-dataset}{%
\subsubsection{Find and repair gaps in
dataset}\label{find-and-repair-gaps-in-dataset}}

This section was inspired by
\href{https://www.geeksforgeeks.org/working-with-missing-data-in-pandas/}{Working
with Missing Data in Pandas}.

\hypertarget{check-for-missing-values-using-isnull}{%
\paragraph{\texorpdfstring{Check for missing values using
\texttt{isnull()}}{Check for missing values using isnull()}}\label{check-for-missing-values-using-isnull}}

In order to check for missing values in a \texttt{pandas.DataFrame}, the
function \texttt{isnull()} is used here. This function returns a
dataframe of boolean values which are \texttt{True} for \textbf{NaN
values}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{15}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Check for NaN values in Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{head}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0519}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.2078}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1948}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.2078}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1948}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1429}}@{}}
\caption{Check for NaN values in Iris dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 0 & 0 & 0 & 0 & 0 \\
1 & 0 & 0 & 0 & 0 & 0 \\
2 & 0 & 0 & 0 & 0 & 0 \\
3 & 0 & 0 & 0 & 0 & 0 \\
4 & 0 & 0 & 0 & 0 & 0 \\
\end{longtable}

    
    Show only the rows that contain gaps (NaN values):

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{16}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{irisdata\PYZus{}df\PYZus{}gaps} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{any}\PY{p}{(}\PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{]}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show NaN values in Iris dataframe only}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}gaps}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2192}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2055}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2192}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2055}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1507}}@{}}
\caption{Show NaN values in Iris dataframe only}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
\end{longtable}

    
    Fine - the \textbf{Iris dataset} seems to be \textbf{complete} :)

So let's use \textbf{another dataset to practise on}. For this
purpose, a \textbf{slightly modified} version of the original
\href{https://media.geeksforgeeks.org/wp-content/uploads/employees.csv}{employees
dataset} is used in the next subsections.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{17}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import data to dataframe from CSV file}
\PY{n}{employees\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{./datasets/employees\PYZus{}edit.csv}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    For the \textbf{before-and-after comparison}, the \textbf{edited} and
the \textbf{original data frames} are connected with each other
(so-called \textbf{merging}). This merging \textbf{requires unique
identifiers} for the individual data records (rows of the data frame).
Using the \textbf{index} of the data frame for this is far too
\textbf{unreliable}, since it can \textbf{change constantly} due to
reordering or the deletion and addition of rows.

Therefore, directly after importing the dataset from the CSV file, the
dataframe \textbf{index} is \textbf{transferred to a new column} as a
permanent and stable \textbf{records identifier}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{18}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Retrieve indices of all rows into a temporary list}
\PY{n}{li\PYZus{}idx} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{index}

\PY{c+c1}{\PYZsh{} Insert indices as a new index column at the first position with \PYZsq{}loc=0\PYZsq{}}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{insert}\PY{p}{(}\PY{n}{loc}\PY{o}{=}\PY{l+m+mi}{0}\PY{p}{,} \PY{n}{column}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{value}\PY{o}{=}\PY{n}{li\PYZus{}idx}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get head and tail of employees dataset}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{10}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0448}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Get head and tail of employees dataset}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 0 & Douglas & Male & 8/6/1993 & 12:42 PM & 97308 & 6945 & 1 &
Marketing \\
1 & 1 & Thomas & Male & 3/31/1996 & 6:53 AM & 61933 & 4.17 & 1 & nan \\
2 & 2 & Maria & Female & 4/23/1993 & 11:17 AM & 130590 & 11858 & 0 &
Finance \\
3 & 3 & Jerry & Male & 3/4/2005 & 1:00 PM & 138705 & 9.34 & 1 &
Finance \\
4 & 4 & Larry & Male & 1/24/1998 & 4:47 PM & 101004 & 1389 & 1 & Client
Services \\
5 & 5 & Dennis & Male & 4/18/1987 & 1:35 AM & 115163 & 10125 & 0 &
Legal \\
6 & 6 & Ruby & Female & 8/17/1987 & 4:20 PM & 65476 & 10012 & 1 &
Product \\
7 & 7 & nan & Female & 7/20/2015 & 10:43 AM & 45906 & 11598 & nan &
Finance \\
8 & 8 & Angela & Female & 11/22/2005 & 6:29 AM & 95570 & 18523 & 1 &
Engineering \\
9 & 9 & Frances & Female & 8/8/2002 & 6:51 AM & 139852 & 7524 & 1 &
Business Development \\
994 & 994 & Robin & Female & 7/24/1987 & 1:35 PM & 100765 & 10982 & 1 &
Client Services \\
995 & 995 & Rose & Female & 8/25/2002 & 5:12 AM & 134505 & 11051 & 1 &
Marketing \\
996 & 996 & Anthony & Male & 10/16/2011 & 8:35 AM & 112769 & 11625 & 1 &
Finance \\
997 & 997 & Tina & Female & 5/15/1997 & 3:53 PM & 56450 & 19.04 & 1 &
Engineering \\
998 & 998 & George & Male & 6/21/2013 & 5:47 PM & 98874 & 4479 & 1 &
Marketing \\
999 & 999 & Henry & nan & 11/23/2014 & 6:09 AM & 132483 & 16655 & 0 &
Distribution \\
1000 & 1000 & Phillip & Male & 1/31/1984 & 6:30 AM & 42392 & 19675 & 0 &
Finance \\
1001 & 1001 & Russell & Male & 5/20/2013 & 12:39 PM & 96914 & 1421 & 0 &
Product \\
1002 & 1002 & Larry & Male & 4/20/2013 & 4:45 PM & 60500 & 11985 & 0 &
Business Development \\
1003 & 1003 & Albert & Male & 5/15/2012 & 6:24 PM & 129949 & 10169 & 1 &
Sales \\
\end{longtable}

    
    Now a \textbf{deep copy} is created to preserve the \textbf{original
data frame} for a later \textbf{before-and-after comparison}---including
the new index column to uniquely identify the records.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{19}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}orig} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    To check for missing values (NaN) in the employees dataset, the function
\texttt{isnull()} is used again:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{20}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Highlight cells with NaN values}
\PY{c+c1}{\PYZsh{} HINT: Set to \PYZsq{}False\PYZsq{} when compiling to PDF!}
\PY{n}{highlight} \PY{o}{=} \PY{k+kc}{False}

\PY{k}{if} \PY{n}{highlight}\PY{p}{:}
    \PY{n}{output} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{style}\PY{o}{.}\PY{n}{highlight\PYZus{}null}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{yellow}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{output}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get head and tail of complete employees dataset }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{                   showing NaN values}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0448}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Get head and tail of complete employees dataset showing NaN
values}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 0 & Douglas & Male & 8/6/1993 & 12:42 PM & 97308 & 6945 & True &
Marketing \\
1 & 1 & Thomas & Male & 3/31/1996 & 6:53 AM & 61933 & 4.17 & True &
nan \\
2 & 2 & Maria & Female & 4/23/1993 & 11:17 AM & 130590 & 11858 & False &
Finance \\
3 & 3 & Jerry & Male & 3/4/2005 & 1:00 PM & 138705 & 9.34 & True &
Finance \\
4 & 4 & Larry & Male & 1/24/1998 & 4:47 PM & 101004 & 1389 & True &
Client Services \\
999 & 999 & Henry & nan & 11/23/2014 & 6:09 AM & 132483 & 16655 & False
& Distribution \\
1000 & 1000 & Phillip & Male & 1/31/1984 & 6:30 AM & 42392 & 19675 &
False & Finance \\
1001 & 1001 & Russell & Male & 5/20/2013 & 12:39 PM & 96914 & 1421 &
False & Product \\
1002 & 1002 & Larry & Male & 4/20/2013 & 4:45 PM & 60500 & 11985 & False
& Business Development \\
1003 & 1003 & Albert & Male & 5/15/2012 & 6:24 PM & 129949 & 10169 &
True & Sales \\
\end{longtable}

    
    Show only the \textbf{gaps} (NaN values) from this incomplete dataset
again:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{21}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}gaps} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{any}\PY{p}{(}\PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Highlight cells with NaN values}
\PY{c+c1}{\PYZsh{} HINT: Set to \PYZsq{}False\PYZsq{} when compiling to PDF!}
\PY{n}{highlight} \PY{o}{=} \PY{k+kc}{False}

\PY{k}{if} \PY{n}{highlight}\PY{p}{:}
    \PY{n}{output} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}gaps}\PY{o}{.}\PY{n}{style}\PY{o}{.}\PY{n}{highlight\PYZus{}null}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{yellow}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{output}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get head and tail of employees dataset showing rows }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{                   with NaN values only}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}gaps}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0400}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0560}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1120}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0800}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1120}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1520}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0800}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0880}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1680}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1120}}@{}}
\caption{Get head and tail of employees dataset showing rows with NaN
values only}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
1 & 1 & Thomas & Male & 3/31/1996 & 6:53 AM & 61933 & 4.17 & 1 & nan \\
7 & 7 & nan & Female & 7/20/2015 & 10:43 AM & 45906 & 11598 & nan &
Finance \\
10 & 10 & Louise & Female & 8/12/1980 & 9:01 AM & 63241 & 15132 & 1 &
nan \\
17 & 17 & Shawn & Male & 12/7/1986 & 7:45 PM & nan & 6414 & 0 &
Product \\
20 & 20 & Lois & nan & 4/22/1995 & 7:18 PM & 64714 & 4934 & 1 & Legal \\
965 & 965 & Antonio & nan & 6/18/1989 & 9:37 PM & 103050 & 3.05 & 0 &
Legal \\
976 & 976 & Victor & nan & 7/28/2006 & 2:49 PM & 76381 & 11159 & 1 &
Sales \\
989 & 989 & Stephen & nan & 7/10/1983 & 8:10 PM & 85668 & 1909 & 0 &
Legal \\
993 & 993 & Justin & nan & 2/10/1991 & 4:58 PM & 38344 & 3794 & 0 &
Legal \\
999 & 999 & Henry & nan & 11/23/2014 & 6:09 AM & 132483 & 16655 & 0 &
Distribution \\
\end{longtable}

    
    \hypertarget{fill-in-missing-string-values-with-fillna}{%
\paragraph{\texorpdfstring{Fill in missing \emph{string} values with
\texttt{fillna()}}{Fill in missing string values with fillna()}}\label{fill-in-missing-string-values-with-fillna}}

Now all null values (NaN) in the string-typed column ``Gender'' are
filled with \emph{``No Gender''}.

\textbf{Warning:} The following example uses \texttt{inplace\ =\ True}
and therefore replaces the missing values directly in the dataframe
\texttt{employees\_df} itself---no deep copy is made!

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{22}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Fill all null values in column \PYZsq{}Gender\PYZsq{} using fillna()}
\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{fillna}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{No Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{inplace} \PY{o}{=} \PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Switch to apply highlight style to dataframe}
\PY{c+c1}{\PYZsh{} HINT: Set to \PYZsq{}False\PYZsq{} when compiling to PDF!}
\PY{n}{highlight} \PY{o}{=} \PY{k+kc}{False}

\PY{c+c1}{\PYZsh{} Show only rows with substituted \PYZsq{}Gender\PYZsq{} column}
\PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}gender} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{No Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{k}{if} \PY{n}{highlight}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Highlight cells by condition}
    \PY{n}{output} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}gender}\PY{o}{.}\PY{n}{style}\PY{o}{.}\PY{n}{apply}\PY{p}{(}\PY{k}{lambda} \PY{n}{x}\PY{p}{:} 
                                                   \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{background: yellow}\PY{l+s+s1}{\PYZsq{}} 
                                                    \PY{k}{if} \PY{n}{v} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{No Gender}\PY{l+s+s1}{\PYZsq{}} 
                                                    \PY{k}{else} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{\PYZdq{}} \PY{k}{for} \PY{n}{v} \PY{o+ow}{in} \PY{n}{x}\PY{p}{]}\PY{p}{,} 
                                                   \PY{n}{axis} \PY{o}{=} \PY{l+m+mi}{1}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{output}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get head and tail of employees dataset showing rows }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{                   with substituted }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{ column}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}gender}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,}
                                   \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0373}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Get head and tail of employees dataset showing rows with
substituted ``Gender'' column}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
20 & 20 & Lois & No Gender & 4/22/1995 & 7:18 PM & 64714 & 4934 & True &
Legal \\
22 & 22 & Joshua & No Gender & 3/8/2012 & 1:58 AM & 90816 & 18816 & True
& Client Services \\
27 & 27 & Scott & No Gender & 7/11/1991 & 6:58 PM & 122367 & 5218 &
False & Legal \\
31 & 31 & Joyce & No Gender & 2/20/2005 & 2:40 PM & 88657 & 12752 &
False & Product \\
41 & 41 & Christine & No Gender & 6/28/2015 & 1:08 AM & 66582 & 11308 &
True & Business Development \\
965 & 965 & Antonio & No Gender & 6/18/1989 & 9:37 PM & 103050 & 3.05 &
False & Legal \\
976 & 976 & Victor & No Gender & 7/28/2006 & 2:49 PM & 76381 & 11159 &
True & Sales \\
989 & 989 & Stephen & No Gender & 7/10/1983 & 8:10 PM & 85668 & 1909 &
False & Legal \\
993 & 993 & Justin & No Gender & 2/10/1991 & 4:58 PM & 38344 & 3794 &
False & Legal \\
999 & 999 & Henry & No Gender & 11/23/2014 & 6:09 AM & 132483 & 16655 &
False & Distribution \\
\end{longtable}

    
    \hypertarget{fill-in-missing-numerical-values-with-median-values}{%
\paragraph{\texorpdfstring{Fill in missing \emph{numerical} values with
median
values}{Fill in missing numerical values with median values}}\label{fill-in-missing-numerical-values-with-median-values}}

Missing integer or float values can be filled with the \textbf{mean} or
\textbf{median values} of the corresponding feature column.

Often, \textbf{removing individual rows} in the dataset or even entire
feature columns is \textbf{impractical} because too much valuable data
would be lost. In this case, various \textbf{interpolation procedures}
can be used to estimate the missing values based on the other values
present in the data set. The most common interpolation methods are
\textbf{mean} and \textbf{median imputation}, where missing values are
replaced by the \textbf{mean} or \textbf{median} of the entire feature
column (see~\cite{ML_ScL_2018}).

Further information on the subject of \textbf{imputation} can be found
here, among other places:

\begin{itemize}
\tightlist
\item
  \href{https://www.statology.org/pandas-fillna-with-median}{Pandas: How
  to Fill NaN Values with Median (3 Examples)}
\item
  \href{https://stackoverflow.com/questions/18689823/pandas-dataframe-replace-nan-values-with-average-of-columns}{pandas
  DataFrame: replace nan values with average of columns}
\item
  \href{https://scikit-learn.org/stable/modules/impute.html}{scikit-learn:
  Imputation of missing values}
\end{itemize}

    Show only the rows with a missing salary:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{23}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}gaps} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{p}{]}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show rows with missing salary only in employees dataset}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}gaps}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0315}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0551}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1496}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0866}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1654}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1339}}@{}}
\caption{Show rows with missing salary only in employees
dataset}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
17 & 17 & Shawn & Male & 12/7/1986 & 7:45 PM & nan & 6414 & False &
Product \\
63 & 63 & Matthew & Male & 1/2/2013 & 10:33 PM & nan & 18.04 & False &
Human Resources \\
76 & 76 & Margaret & Female & 9/10/1988 & 12:42 PM & nan & 7353 & True &
Distribution \\
\end{longtable}

    
    As will be shown later in the section
\hyperref[display-histogram]{Display Histogram}, the \textbf{salary
structure depends} very much \textbf{on gender}. In order \textbf{not to
distort the dataset} too much, the \textbf{median salaries} are
determined \textbf{gender-specifically}.

First, the \textbf{male} and \textbf{female employees} with missing
salary records are filtered:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{24}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Filter MALE employees with missing salary records}
\PY{n}{employees\PYZus{}df\PYZus{}gaps} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Male}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PYZbs{}
                                     \PY{o}{\PYZam{}} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Get indices of incomplete rows}
\PY{n}{index\PYZus{}list\PYZus{}male\PYZus{}salary\PYZus{}nan} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}gaps}\PY{o}{.}\PY{n}{index}\PY{o}{.}\PY{n}{to\PYZus{}list}\PY{p}{(}\PY{p}{)}
\PY{n}{index\PYZus{}list\PYZus{}male\PYZus{}salary\PYZus{}nan}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{24}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
[17, 63]
\end{Verbatim}
\end{tcolorbox}
        
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{25}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Filter FEMALE employees with missing salary records}
\PY{n}{employees\PYZus{}df\PYZus{}gaps} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Female}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PYZbs{}
                                     \PY{o}{\PYZam{}} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{isnull}\PY{p}{(}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Get indices of incomplete rows}
\PY{n}{index\PYZus{}list\PYZus{}female\PYZus{}salary\PYZus{}nan} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}gaps}\PY{o}{.}\PY{n}{index}\PY{o}{.}\PY{n}{to\PYZus{}list}\PY{p}{(}\PY{p}{)}
\PY{n}{index\PYZus{}list\PYZus{}female\PYZus{}salary\PYZus{}nan}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{25}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
[76]
\end{Verbatim}
\end{tcolorbox}
        
    Get \textbf{median salaries} gender-specifically:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{26}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Get median salary for MALE employees}
\PY{n}{employees\PYZus{}df\PYZus{}male} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Male}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{salary\PYZus{}median\PYZus{}male} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}male}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{median}\PY{p}{(}\PY{p}{)}
\PY{n}{salary\PYZus{}median\PYZus{}male}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{26}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
90370.0
\end{Verbatim}
\end{tcolorbox}
        
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{27}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Get median salary for FEMALE employees}
\PY{n}{employees\PYZus{}df\PYZus{}female} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Female}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{salary\PYZus{}median\PYZus{}female} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}female}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{median}\PY{p}{(}\PY{p}{)}
\PY{n}{salary\PYZus{}median\PYZus{}female}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{27}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
90032.5
\end{Verbatim}
\end{tcolorbox}
        
    Fill missing salary records with the \textbf{gender-specific median}
of the salary column and replace NaN values in the original dataframe:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{28}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Fill missing salary records by MALE indices}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{index\PYZus{}list\PYZus{}male\PYZus{}salary\PYZus{}nan}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{=} \PY{n}{salary\PYZus{}median\PYZus{}male}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show filled missing salary records in dataset for male employees}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{index\PYZus{}list\PYZus{}male\PYZus{}salary\PYZus{}nan}\PY{p}{]}\PY{p}{,}
                               \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0315}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0551}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1496}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0866}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1654}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1339}}@{}}
\caption{Show filled missing salary records in dataset for male
employees}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
17 & 17 & Shawn & Male & 12/7/1986 & 7:45 PM & 90370 & 6414 & False &
Product \\
63 & 63 & Matthew & Male & 1/2/2013 & 10:33 PM & 90370 & 18.04 & False &
Human Resources \\
\end{longtable}

    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{29}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Fill missing salary records by FEMALE indices}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{index\PYZus{}list\PYZus{}female\PYZus{}salary\PYZus{}nan}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{=} \PY{n}{salary\PYZus{}median\PYZus{}female}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show filled missing salary records in dataset for female employees}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{n}{index\PYZus{}list\PYZus{}female\PYZus{}salary\PYZus{}nan}\PY{p}{]}\PY{p}{,}
                               \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0323}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0565}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1129}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0806}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1129}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1532}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0806}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0887}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1694}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1129}}@{}}
\caption{Show filled missing salary records in dataset for female
employees}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
76 & 76 & Margaret & Female & 9/10/1988 & 12:42 PM & 90032.5 & 7353 &
True & Distribution \\
\end{longtable}

    
    \textbf{Merge} the male and female index lists and \textbf{extend} the
indices by their direct \textbf{neighbors} for a later
\textbf{before-and-after comparison}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{30}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Function for retrieving the indices of previous and next rows of a dataframe}
\PY{k}{def} \PY{n+nf}{get\PYZus{}previous\PYZus{}and\PYZus{}next\PYZus{}rows\PYZus{}from\PYZus{}df}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{search\PYZus{}idx}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Get list of indices of dataframe rows}
    \PY{n}{index\PYZus{}list} \PY{o}{=} \PY{n}{df}\PY{o}{.}\PY{n}{index}\PY{o}{.}\PY{n}{to\PYZus{}list}\PY{p}{(}\PY{p}{)}

    \PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}prev} \PY{o}{=} \PY{l+m+mi}{0}
    \PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}next} \PY{o}{=} \PY{l+m+mi}{0}
    \PY{n}{b\PYZus{}element\PYZus{}found} \PY{o}{=} \PY{k+kc}{False}

    \PY{c+c1}{\PYZsh{} Cycle over list of indices of given dataframe}
    \PY{k}{for} \PY{n}{li\PYZus{}idx\PYZus{}df}\PY{p}{,} \PY{n}{li\PYZus{}elem\PYZus{}df} \PY{o+ow}{in} \PY{n+nb}{enumerate}\PY{p}{(}\PY{n}{index\PYZus{}list}\PY{p}{)}\PY{p}{:}
        \PY{c+c1}{\PYZsh{} Check index bounds}
        \PY{k}{if} \PY{p}{(}\PY{n}{li\PYZus{}idx\PYZus{}df}\PY{o}{+}\PY{l+m+mi}{1} \PY{o}{\PYZlt{}} \PY{n+nb}{len}\PY{p}{(}\PY{n}{index\PYZus{}list}\PY{p}{)} \PY{o+ow}{and} \PY{n}{li\PYZus{}idx\PYZus{}df}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1} \PY{o}{\PYZgt{}}\PY{o}{=} \PY{l+m+mi}{0}\PY{p}{)}\PY{p}{:}

            \PY{c+c1}{\PYZsh{} Get previous and successing list elements}
            \PY{k}{if} \PY{n}{li\PYZus{}elem\PYZus{}df} \PY{o}{==} \PY{n}{search\PYZus{}idx}\PY{p}{:}
                \PY{n}{b\PYZus{}element\PYZus{}found} \PY{o}{=} \PY{k+kc}{True}
                \PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}prev} \PY{o}{=} \PY{n}{index\PYZus{}list}\PY{p}{[}\PY{n}{li\PYZus{}idx\PYZus{}df}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{]}
                \PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}next} \PY{o}{=} \PY{n}{index\PYZus{}list}\PY{p}{[}\PY{n}{li\PYZus{}idx\PYZus{}df}\PY{o}{+}\PY{l+m+mi}{1}\PY{p}{]}

    \PY{k}{if} \PY{n}{b\PYZus{}element\PYZus{}found}\PY{p}{:}
        \PY{k}{return} \PY{p}{[}\PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}prev}\PY{p}{,} \PY{n}{search\PYZus{}idx}\PY{p}{,} \PY{n}{li\PYZus{}elem\PYZus{}df\PYZus{}next}\PY{p}{]}
    \PY{k}{else}\PY{p}{:}
        \PY{n+nb}{print}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Element was not found :(}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{k}{return} \PY{p}{[}\PY{p}{]}
\end{Verbatim}
\end{tcolorbox}

    Generate a \textbf{combined index list} of the males and females with
missing salary records, together with their direct \textbf{neighbors of
the same gender}, for comparison:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{31}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Merge MALE and FEMALE index lists}
\PY{n}{index\PYZus{}list\PYZus{}merged} \PY{o}{=} \PY{p}{[}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Filter MALE employees}
\PY{n}{employees\PYZus{}df\PYZus{}male} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Male}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Cycle over list of MALE employees with missing salary records}
\PY{k}{for} \PY{n}{li\PYZus{}idx\PYZus{}salary\PYZus{}nan} \PY{o+ow}{in} \PY{n}{index\PYZus{}list\PYZus{}male\PYZus{}salary\PYZus{}nan}\PY{p}{:}
    \PY{n}{li\PYZus{}neighbors\PYZus{}male} \PY{o}{=} \PY{n}{get\PYZus{}previous\PYZus{}and\PYZus{}next\PYZus{}rows\PYZus{}from\PYZus{}df}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}male}\PY{p}{,} 
                                                           \PY{n}{li\PYZus{}idx\PYZus{}salary\PYZus{}nan}\PY{p}{)}
    \PY{n}{index\PYZus{}list\PYZus{}merged}\PY{o}{.}\PY{n}{extend}\PY{p}{(}\PY{n}{li\PYZus{}neighbors\PYZus{}male}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Filter FEMALE employees}
\PY{n}{employees\PYZus{}df\PYZus{}female} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Female}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Cycle over list of FEMALE employees with missing salary records}
\PY{k}{for} \PY{n}{li\PYZus{}idx\PYZus{}salary\PYZus{}nan} \PY{o+ow}{in} \PY{n}{index\PYZus{}list\PYZus{}female\PYZus{}salary\PYZus{}nan}\PY{p}{:}
    \PY{n}{li\PYZus{}neighbors\PYZus{}female} \PY{o}{=} \PY{n}{get\PYZus{}previous\PYZus{}and\PYZus{}next\PYZus{}rows\PYZus{}from\PYZus{}df}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}female}\PY{p}{,} 
                                                           \PY{n}{li\PYZus{}idx\PYZus{}salary\PYZus{}nan}\PY{p}{)}
    \PY{n}{index\PYZus{}list\PYZus{}merged}\PY{o}{.}\PY{n}{extend}\PY{p}{(}\PY{n}{li\PYZus{}neighbors\PYZus{}female}\PY{p}{)}

\PY{n}{index\PYZus{}list\PYZus{}merged}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{31}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
[16, 17, 21, 57, 63, 65, 75, 76, 78]
\end{Verbatim}
\end{tcolorbox}
        
    Show rows with replaced \texttt{salary} by extended index list for both
genders (including the direct neighbors) for \textbf{before-and-after
comparison}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{32}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Switch to apply highlight style to dataframe}
\PY{c+c1}{\PYZsh{} HINT: Set to \PYZsq{}False\PYZsq{} when compiling to PDF!}
\PY{n}{highlight} \PY{o}{=} \PY{k+kc}{False}

\PY{c+c1}{\PYZsh{} Show rows with replaced \PYZsq{}salary\PYZsq{} by index and its direct neighbors}
\PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}salary} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{iloc}\PY{p}{[}\PY{n}{index\PYZus{}list\PYZus{}merged}\PY{p}{]}

\PY{k}{if} \PY{n}{highlight}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Highlight cells by condition}
    \PY{n}{output} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}salary}\PY{o}{.}\PY{n}{style}\PY{o}{.}\PY{n}{apply}\PY{p}{(}\PY{k}{lambda} \PY{n}{x}\PY{p}{:} 
                                       \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{background: yellow}\PY{l+s+s1}{\PYZsq{}} 
                                        \PY{k}{if} \PY{p}{(}\PY{n}{v} \PY{o}{==} \PY{n}{salary\PYZus{}median\PYZus{}male} 
                                            \PY{o+ow}{or} \PY{n}{v} \PY{o}{==} \PY{n}{salary\PYZus{}median\PYZus{}female}\PY{p}{)} 
                                        \PY{k}{else} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{\PYZdq{}} \PY{k}{for} \PY{n}{v} \PY{o+ow}{in} \PY{n}{x}\PY{p}{]}\PY{p}{,} 
                                       \PY{n}{axis} \PY{o}{=} \PY{l+m+mi}{1}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{output}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show rows with replaced salary by extended index list }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{                   for both genders and their neighbors}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}filled\PYZus{}salary}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0315}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0551}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1102}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1496}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0787}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0866}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1654}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1339}}@{}}
\caption{Show rows with replaced salary by extended index list for both
genders and their neighbors}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
16 & 16 & Jeremy & Male & 9/21/2010 & 5:56 AM & 90370 & 7369 & False &
Human Resources \\
17 & 17 & Shawn & Male & 12/7/1986 & 7:45 PM & 90370 & 6414 & False &
Product \\
21 & 21 & Matthew & Male & 9/5/1995 & 2:12 AM & 100612 & 13645 & False &
Marketing \\
57 & 57 & Henry & Male & 6/26/1996 & 1:44 AM & 64715 & 15107 & True &
Human Resources \\
63 & 63 & Matthew & Male & 1/2/2013 & 10:33 PM & 90370 & 18.04 & False &
Human Resources \\
65 & 65 & Steve & Male & 11/11/2009 & 11:44 PM & 61310 & 12428 & True &
Distribution \\
75 & 75 & Bonnie & Female & 7/2/1991 & 1:27 AM & 104897 & 5118 & True &
Human Resources \\
76 & 76 & Margaret & Female & 9/10/1988 & 12:42 PM & 90032.5 & 7353 &
True & Distribution \\
78 & 78 & Robin & Female & 6/4/1983 & 3:15 PM & 114797 & 5965 & True &
Sales \\
\end{longtable}

    
    By pure coincidence, \texttt{Jeremy} (idx 16), the \textbf{predecessor} of
\texttt{Shawn} (whose salary record was missing), already has exactly the
\textbf{median male salary} of the entire dataset. Therefore, this value
is also highlighted when highlighting is enabled.

    \hypertarget{drop-missing-values-using-dropna}{%
\paragraph{\texorpdfstring{Drop missing values using
\texttt{dropna()}}{Drop missing values using dropna()}}\label{drop-missing-values-using-dropna}}

To drop null values from a dataframe, we use the \texttt{dropna()}
function. Depending on its parameters, this function drops rows or
columns that contain NaN values.

By default, rows with at least one null value (NaN) are dropped. With the
parameter \texttt{how\ =\ \textquotesingle{}all\textquotesingle{}}, the
function drops only those rows in which all values are missing (NaN).

\textbf{Warning:} The following example drops rows with missing values
(NaN) directly in the original dataframe with \texttt{inplace\ =\ True}
-- no deep copy is made!

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{33}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Drop rows with missing values directly in the original dataframe}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{dropna}\PY{p}{(}\PY{n}{axis} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{,} \PY{n}{how} \PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{any}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{inplace} \PY{o}{=} \PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Finally, we compare the sizes of the dataframes to learn how many
rows had at least one null value.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{34}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Original dataframe length:}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{n+nb}{len}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}orig}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{New dataframe length:}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{n+nb}{len}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Number of rows with at least 1 NaN value: }\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} 
      \PY{p}{(}\PY{n+nb}{len}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}orig}\PY{p}{)}\PY{o}{\PYZhy{}}\PY{n+nb}{len}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{)}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Original dataframe length: 1004
New dataframe length: 903
Number of rows with at least 1 NaN value:  101
    \end{Verbatim}

    \hypertarget{find-and-remove-duplicates-in-dataset}{%
\subsubsection{Find and remove duplicates in
dataset}\label{find-and-remove-duplicates-in-dataset}}

This section was inspired by:

\begin{itemize}
\tightlist
\item
  \href{https://www.statology.org/pandas-find-duplicates/}{How to Find
  Duplicates in Pandas DataFrame (With Examples)}
\item
  \href{https://www.statology.org/pandas-drop-duplicates/}{How to Drop
  Duplicate Rows in a Pandas DataFrame}
\end{itemize}

    \hypertarget{check-for-duplicate-values-using-duplicated}{%
\paragraph{\texorpdfstring{Check for duplicate values using
\texttt{duplicated()}}{Check for duplicate values using duplicated()}}\label{check-for-duplicate-values-using-duplicated}}

To check for duplicate values in a \texttt{pandas.DataFrame}, we
use the \texttt{duplicated()} function. This function can be used in
several ways:

\begin{itemize}
\tightlist
\item
  find duplicate rows across \textbf{all columns} with
  \texttt{df.duplicated()}
\item
  find duplicate rows across \textbf{specific columns} with parameter
  \texttt{subset={[}\textquotesingle{}col1\textquotesingle{},\ \textquotesingle{}col2\textquotesingle{}{]}}
\item
  mark last duplicates for removing and \textbf{keep the first
  occurrences} with parameter
  \texttt{keep=\textquotesingle{}first\textquotesingle{}}
\item
  mark first duplicates for removing and \textbf{keep the last
  occurrences} with parameter
  \texttt{keep=\textquotesingle{}last\textquotesingle{}}
\item
  \textbf{mark all duplicates} for removing with parameter
  \texttt{keep=False}
\end{itemize}

Find duplicate rows across \textbf{all columns}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{35}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Find duplicate rows across all columns}
\PY{c+c1}{\PYZsh{} The column \PYZsq{}idx\PYZsq{} has to be ignored}
\PY{n}{column\PYZus{}subset} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{columns}\PY{o}{.}\PY{n}{difference}\PY{p}{(}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{)}
\PY{n}{duplicateRows} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{duplicated}\PY{p}{(}\PY{n}{subset}\PY{o}{=}\PY{n}{column\PYZus{}subset}\PY{p}{)}\PY{p}{]}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Find duplicate rows across all columns}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{duplicateRows}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0373}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Find duplicate rows across all columns}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
112 & 112 & Karen & Female & 11/30/1999 & 7:46 AM & 102488 & 17653 &
True & Product \\
127 & 127 & Linda & Female & 5/25/2000 & 5:45 PM & 119009 & 12506 & True
& Business Development \\
296 & 296 & Brandon & No Gender & 11/3/1997 & 8:17 PM & 121333 & 15295 &
False & Business Development \\
580 & 580 & Nicholas & Male & 3/1/2013 & 9:26 PM & 101036 & 2826 & True
& Human Resources \\
\end{longtable}

    
    Find \textbf{all completely identical duplicates} (first and last
occurrences). The resulting dataframe is \textbf{sorted by column}
\texttt{\textquotesingle{}First\ Name\textquotesingle{}} to get the
\textbf{duplicates grouped}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{36}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Parameter \PYZsq{}keep=False\PYZsq{} displays all duplicate rows}
\PY{n}{duplicateRows} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{duplicated}\PY{p}{(}\PY{n}{keep}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} 
                                                     \PY{n}{subset}\PY{o}{=}\PY{n}{column\PYZus{}subset}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Sort rows by column \PYZsq{}First Name\PYZsq{} to get the duplicates grouped}
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Find all completely identical duplicates and group }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{               by column }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{duplicateRows}\PY{o}{.}\PY{n}{sort\PYZus{}values}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0373}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Find all completely identical duplicates and group by column
``First Name''}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
153 & 153 & Brandon & No Gender & 11/3/1997 & 8:17 PM & 121333 & 15295 &
False & Business Development \\
296 & 296 & Brandon & No Gender & 11/3/1997 & 8:17 PM & 121333 & 15295 &
False & Business Development \\
55 & 55 & Karen & Female & 11/30/1999 & 7:46 AM & 102488 & 17653 & True
& Product \\
112 & 112 & Karen & Female & 11/30/1999 & 7:46 AM & 102488 & 17653 &
True & Product \\
92 & 92 & Linda & Female & 5/25/2000 & 5:45 PM & 119009 & 12506 & True &
Business Development \\
127 & 127 & Linda & Female & 5/25/2000 & 5:45 PM & 119009 & 12506 & True
& Business Development \\
442 & 442 & Nicholas & Male & 3/1/2013 & 9:26 PM & 101036 & 2826 & True
& Human Resources \\
580 & 580 & Nicholas & Male & 3/1/2013 & 9:26 PM & 101036 & 2826 & True
& Human Resources \\
\end{longtable}

    
    Find \textbf{all duplicates} (first and last occurrences) across
\textbf{specific columns}. The resulting dataframe is \textbf{sorted by
multiple columns}
\texttt{\textquotesingle{}First\ Name\textquotesingle{}} and
\texttt{\textquotesingle{}Last\ Login\ Time\textquotesingle{}} to get
the \textbf{duplicates grouped}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{37}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Parameter \PYZsq{}keep=False\PYZsq{} displays all duplicate rows}
\PY{n}{duplicateRows} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{duplicated}\PY{p}{(}
                    \PY{n}{subset}\PY{o}{=}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{keep}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Sort rows by multiple columns \PYZsq{}First Name\PYZsq{} and \PYZsq{}Last Login Time\PYZsq{} to get }
\PY{c+c1}{\PYZsh{} the duplicates grouped}
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Find all completely identical duplicates and group }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{               by multiple columns }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{ and }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{duplicateRows}\PY{o}{.}\PY{n}{sort\PYZus{}values}\PY{p}{(}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} 
                                                          \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{)}\PY{p}{,} 
                               \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0373}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0522}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1045}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1418}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0746}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0821}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1567}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1642}}@{}}
\caption{Find all completely identical duplicates and group by multiple
columns ``First Name'' and ``Last Login Time''}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
153 & 153 & Brandon & No Gender & 11/3/1997 & 8:17 PM & 121333 & 15295 &
False & Business Development \\
296 & 296 & Brandon & No Gender & 11/3/1997 & 8:17 PM & 121333 & 15295 &
False & Business Development \\
55 & 55 & Karen & Female & 11/30/1999 & 7:46 AM & 102488 & 17653 & True
& Product \\
112 & 112 & Karen & Female & 11/30/1999 & 7:46 AM & 102488 & 17653 &
True & Product \\
92 & 92 & Linda & Female & 5/25/2000 & 5:45 PM & 119009 & 12506 & True &
Business Development \\
127 & 127 & Linda & Female & 5/25/2000 & 5:45 PM & 119009 & 12506 & True
& Business Development \\
37 & 37 & Linda & Female & 10/19/1981 & 8:49 PM & 57427 & 9557 & True &
Client Services \\
973 & 973 & Linda & Female & 2/4/2010 & 8:49 PM & 44486 & 17308 & True &
Engineering \\
66 & 66 & Nancy & Female & 12/15/2012 & 11:57 PM & 125250 & 2672 & True
& Business Development \\
934 & 934 & Nancy & Female & 9/10/2001 & 11:57 PM & 85213 & 2386 & True
& Marketing \\
442 & 442 & Nicholas & Male & 3/1/2013 & 9:26 PM & 101036 & 2826 & True
& Human Resources \\
580 & 580 & Nicholas & Male & 3/1/2013 & 9:26 PM & 101036 & 2826 & True
& Human Resources \\
\end{longtable}

    
    \hypertarget{drop-duplicate-values-using-drop_duplicates}{%
\paragraph{\texorpdfstring{Drop duplicate values using
\texttt{drop\_duplicates()}}{Drop duplicate values using drop\_duplicates()}}\label{drop-duplicate-values-using-drop_duplicates}}

In order to drop duplicate values from a dataframe, we use the
\texttt{drop\_duplicates()} function.

This function can be used in two ways:

\begin{itemize}
\tightlist
\item
  remove duplicate rows across \textbf{all columns} with
  \texttt{df.drop\_duplicates()}
\item
  remove duplicate rows across \textbf{specific columns} with parameter
  \texttt{subset={[}\textquotesingle{}col1\textquotesingle{},\ \textquotesingle{}col2\textquotesingle{}{]}}
\end{itemize}

\textbf{Warning:} The following example removes the duplicate rows
directly from the original dataframe with \texttt{inplace\ =\ True} - no
deep copy is made!

Remove duplicate rows across \textbf{all columns}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{38}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Remove duplicate rows across all columns}
\PY{c+c1}{\PYZsh{} The column \PYZsq{}idx\PYZsq{} has to be ignored}
\PY{n}{column\PYZus{}subset} \PY{o}{=} \PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{columns}\PY{o}{.}\PY{n}{difference}\PY{p}{(}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{)}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{drop\PYZus{}duplicates}\PY{p}{(}\PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{subset}\PY{o}{=}\PY{n}{column\PYZus{}subset}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show dataset after removing all completely identical duplicates}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0444}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0519}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1037}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0815}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1037}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1407}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0741}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0815}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1556}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1630}}@{}}
\caption{Show dataset after removing all completely identical
duplicates}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 0 & Douglas & Male & 8/6/1993 & 12:42 PM & 97308 & 6945 & True &
Marketing \\
2 & 2 & Maria & Female & 4/23/1993 & 11:17 AM & 130590 & 11858 & False &
Finance \\
3 & 3 & Jerry & Male & 3/4/2005 & 1:00 PM & 138705 & 9.34 & True &
Finance \\
4 & 4 & Larry & Male & 1/24/1998 & 4:47 PM & 101004 & 1389 & True &
Client Services \\
5 & 5 & Dennis & Male & 4/18/1987 & 1:35 AM & 115163 & 10125 & False &
Legal \\
999 & 999 & Henry & No Gender & 11/23/2014 & 6:09 AM & 132483 & 16655 &
False & Distribution \\
1000 & 1000 & Phillip & Male & 1/31/1984 & 6:30 AM & 42392 & 19675 &
False & Finance \\
1001 & 1001 & Russell & Male & 5/20/2013 & 12:39 PM & 96914 & 1421 &
False & Product \\
1002 & 1002 & Larry & Male & 4/20/2013 & 4:45 PM & 60500 & 11985 & False
& Business Development \\
1003 & 1003 & Albert & Male & 5/15/2012 & 6:24 PM & 129949 & 10169 &
True & Sales \\
\end{longtable}

    
    Remove duplicate rows across \textbf{specific columns}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{39}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Remove duplicate rows across \PYZsq{}First Name\PYZsq{} and \PYZsq{}Last Login Time\PYZsq{} columns}
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{drop\PYZus{}duplicates}\PY{p}{(}
    \PY{n}{subset}\PY{o}{=}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{keep}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{last}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show dataset after removing duplicate rows across }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{               columns }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{ and }\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZdq{}}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0444}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0519}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1037}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0815}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1037}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1407}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0741}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0815}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1556}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1630}}@{}}
\caption{Show dataset after removing duplicate rows across columns
``First Name'' and ``Last Login Time''}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \%
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 0 & Douglas & Male & 8/6/1993 & 12:42 PM & 97308 & 6945 & True &
Marketing \\
2 & 2 & Maria & Female & 4/23/1993 & 11:17 AM & 130590 & 11858 & False &
Finance \\
3 & 3 & Jerry & Male & 3/4/2005 & 1:00 PM & 138705 & 9.34 & True &
Finance \\
4 & 4 & Larry & Male & 1/24/1998 & 4:47 PM & 101004 & 1389 & True &
Client Services \\
5 & 5 & Dennis & Male & 4/18/1987 & 1:35 AM & 115163 & 10125 & False &
Legal \\
999 & 999 & Henry & No Gender & 11/23/2014 & 6:09 AM & 132483 & 16655 &
False & Distribution \\
1000 & 1000 & Phillip & Male & 1/31/1984 & 6:30 AM & 42392 & 19675 &
False & Finance \\
1001 & 1001 & Russell & Male & 5/20/2013 & 12:39 PM & 96914 & 1421 &
False & Product \\
1002 & 1002 & Larry & Male & 4/20/2013 & 4:45 PM & 60500 & 11985 & False
& Business Development \\
1003 & 1003 & Albert & Male & 5/15/2012 & 6:24 PM & 129949 & 10169 &
True & Sales \\
\end{longtable}

    
    \hypertarget{compare-the-edited-dataset-with-the-original-dataset-side-by-side}{%
\subsubsection{Compare the edited dataset with the original dataset
side-by-side}\label{compare-the-edited-dataset-with-the-original-dataset-side-by-side}}

In the previous sections, the dataframe holding the \textbf{employees
dataset} was \textbf{heavily edited} by \textbf{adding missing values}
(where it was appropriate) or \textbf{deleting gapped rows} completely.
Therefore, the \textbf{modifications made} to the dataset should be
finally \textbf{checked}.

The \texttt{Pandas} package provides the \texttt{compare()} function,
which can be used to \textbf{compare dataframes} and \textbf{display
differences} (see here:
\href{https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.compare.html}{pandas.DataFrame.compare}).
Unfortunately, the documentation points out that this function can
\textbf{only} be used to \textbf{compare dataframes} with the
\textbf{same shape} (number of columns and rows) and \textbf{identical
row and column labels}.

Among other things, a lot of \textbf{gapped rows have been removed} due
to the heavy editing. Therefore, the original and the edited dataframe
are \textbf{anything but identical} in terms of their \textbf{number of
rows}. As expected, the following short test (commented out here) would
end with a \textbf{long error message}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{40}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Pandas.compare() does not work with dropped rows in one dataframe!}
\PY{c+c1}{\PYZsh{}employees\PYZus{}df\PYZus{}orig.compare(employees\PYZus{}df, keep\PYZus{}shape=False, keep\PYZus{}equal=True)}
\end{Verbatim}
\end{tcolorbox}

    Therefore, in this subsection, a \textbf{customized method} of
\textbf{comparing two dataframes} that are \textbf{not identical} in
terms of \textbf{shape} has been developed.

The following approach shows one way to \textbf{compare} the
\textbf{modified dataframe} with the \textbf{original dataframe} -
despite the different number of rows. This is done by \textbf{merging
both dataframes} into one, displaying the \textbf{same columns} of both
datasets \textbf{side-by-side} and highlighting the \textbf{changes}.

The following sources were the inspiration for this subsection:

\begin{itemize}
\tightlist
\item
  \href{https://stackoverflow.com/questions/17095101/compare-two-dataframes-and-output-their-differences-side-by-side/47112033\#47112033}{Compare
  two DataFrames and output their differences side-by-side}
\item
  \href{https://stackoverflow.com/questions/71604701/pandas-compare-two-data-frames-and-highlight-the-differences/71617662\#71617662}{pandas
  compare two data frames and highlight the differences}
\item
  \href{https://www.skytowner.com/explore/highlighting_a_particular_cell_of_a_dataframe_in_pandas}{Highlighting
  a particular cell of a DataFrame in Pandas}
\item
  \href{https://datascientyst.com/compare-two-pandas-dataframes-get-differences/}{How
  to Compare Two Pandas DataFrames and Get Differences}
\item
  \href{https://stackoverflow.com/questions/70566100/how-to-highlight-differences-between-the-two-data-frames-in-pandas}{How
  to highlight differences between the two data frames in pandas}
\item
  \href{https://pandas.pydata.org/docs/dev/reference/api/pandas.DataFrame.compare.html}{pandas.DataFrame.compare}
\item
  \href{https://pandas.pydata.org/pandas-docs/stable/user_guide/style.html}{Pandas
  Table Visualization}
\item
  \href{https://towardsdatascience.com/10-examples-to-master-pandas-styler-408ea794e91}{10
  Examples to Master Pandas Styler}
\item
  \href{https://towardsdatascience.com/style-pandas-dataframe-like-a-master-6b02bf6468b0}{Style
  Pandas Dataframe Like a Master}
\item
  \href{https://towardsdatascience.com/a-quick-and-easy-guide-to-conditional-formatting-in-pandas-8783035071ee}{A
  Quick and Easy Guide to Conditional Formatting in Pandas}
\end{itemize}

    First, the \textbf{original} and the \textbf{modified dataframes} are
\textbf{merged} into a new dataframe, using the \textbf{index column}
\texttt{idx} to \textbf{synchronize} the two dataframes.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{41}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}merged} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{merge}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}orig}\PY{p}{,} 
                               \PY{n}{employees\PYZus{}df}\PY{p}{,} 
                               \PY{n}{how}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{left}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{on}\PY{o}{=}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    During the merge operation, the \textbf{column suffixes} \texttt{\_x}
(first dataframe) and \texttt{\_y} (second dataframe) are automatically added.

For better comprehensibility, these suffixes are \textbf{renamed} to
\texttt{orig} (original) and \texttt{edit} (edited dataframe) using
\textbf{lambda inline functions}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{42}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{rename}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{k}{lambda} \PY{n}{x}\PY{p}{:} \PY{n}{x}\PY{o}{.}\PY{n}{replace}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZus{}x}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ orig}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{,} \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{rename}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{k}{lambda} \PY{n}{x}\PY{p}{:} \PY{n}{x}\PY{o}{.}\PY{n}{replace}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZus{}y}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ edit}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{,} \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Print columnname as a list}
\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{columns}\PY{o}{.}\PY{n}{to\PYZus{}list}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{42}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
['idx',
 'First Name orig',
 'Gender orig',
 'Start Date orig',
 'Last Login Time orig',
 'Salary orig',
 'Bonus \% orig',
 'Senior Management orig',
 'Team orig',
 'First Name edit',
 'Gender edit',
 'Start Date edit',
 'Last Login Time edit',
 'Salary edit',
 'Bonus \% edit',
 'Senior Management edit',
 'Team edit']
\end{Verbatim}
\end{tcolorbox}
        
    In order to have the \textbf{columns} of the original and the edited
dataframe directly \textbf{next to each other} for a \textbf{better
comparison}, a \textbf{column list} is created with the \textbf{new
order}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{43}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{li\PYZus{}reordered\PYZus{}cols} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Iterate over columns}
\PY{k}{for} \PY{n}{column} \PY{o+ow}{in} \PY{n}{employees\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{columns}\PY{p}{:}
    \PY{k}{if} \PY{n}{column} \PY{o}{!=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}
        \PY{c+c1}{\PYZsh{} Create new order of column names}
        \PY{n}{li\PYZus{}reordered\PYZus{}cols}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{column} \PY{o}{+} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ orig}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{n}{li\PYZus{}reordered\PYZus{}cols}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{column} \PY{o}{+} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ edit}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{li\PYZus{}reordered\PYZus{}cols}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{43}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
['idx',
 'First Name orig',
 'First Name edit',
 'Gender orig',
 'Gender edit',
 'Start Date orig',
 'Start Date edit',
 'Last Login Time orig',
 'Last Login Time edit',
 'Salary orig',
 'Salary edit',
 'Bonus \% orig',
 'Bonus \% edit',
 'Senior Management orig',
 'Senior Management edit',
 'Team orig',
 'Team edit']
\end{Verbatim}
\end{tcolorbox}
        
    In order to have \textbf{columns with the same meaning next to each
other}, the columns of the merged data frame are \textbf{re-sorted}
based on the new \textbf{column list}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{44}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}merged} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{reindex}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{n}{li\PYZus{}reordered\PYZus{}cols}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Due to the limitation of the page width, the dataframe is displayed in 2 parts}
\PY{n}{li\PYZus{}cols2display\PYZus{}01} \PY{o}{=} \PY{p}{[}\PY{p}{]}
\PY{n}{li\PYZus{}cols2display\PYZus{}02} \PY{o}{=} \PY{p}{[}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Iterate over columns}
\PY{k}{for} \PY{n}{column} \PY{o+ow}{in} \PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{columns}\PY{p}{:}
    \PY{k}{if} \PY{n}{column} \PY{o}{!=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}
        \PY{c+c1}{\PYZsh{} Select column names}
        \PY{k}{if} \PY{p}{(}\PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{First Name}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
            \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
            \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Start Date}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
            \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Last Login Time}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{)}\PY{p}{:}
            
            \PY{n}{li\PYZus{}cols2display\PYZus{}01}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{column}\PY{p}{)}
        \PY{k}{elif} \PY{p}{(}\PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
              \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Bonus }\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
              \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Senior Management}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o+ow}{or}
              \PY{n}{column}\PY{o}{.}\PY{n}{startswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Team}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{)}\PY{p}{:}
            
            \PY{n}{li\PYZus{}cols2display\PYZus{}02}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{column}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show part 1 of the merged and reordered dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{p}{[}\PY{n}{li\PYZus{}cols2display\PYZus{}01}\PY{p}{]}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} 
                               \PY{n}{str\PYZus{}caption}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Show part 2 of the merged and reordered dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{p}{[}\PY{n}{li\PYZus{}cols2display\PYZus{}02}\PY{p}{]}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{5}\PY{p}{)}\PY{p}{,} 
                               \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0375}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1188}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1188}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0938}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0938}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1188}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1188}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1500}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1500}}@{}}
\caption{Show part 1 of the merged and reordered
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time edit
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time edit
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & Douglas & Douglas & Male & Male & 8/6/1993 & 8/6/1993 & 12:42 PM &
12:42 PM \\
1 & Thomas & nan & Male & nan & 3/31/1996 & nan & 6:53 AM & nan \\
2 & Maria & Maria & Female & Female & 4/23/1993 & 4/23/1993 & 11:17 AM &
11:17 AM \\
3 & Jerry & Jerry & Male & Male & 3/4/2005 & 3/4/2005 & 1:00 PM & 1:00
PM \\
4 & Larry & Larry & Male & Male & 1/24/1998 & 1/24/1998 & 4:47 PM & 4:47
PM \\
999 & Henry & Henry & nan & No Gender & 11/23/2014 & 11/23/2014 & 6:09
AM & 6:09 AM \\
1000 & Phillip & Phillip & Male & Male & 1/31/1984 & 1/31/1984 & 6:30 AM
& 6:30 AM \\
1001 & Russell & Russell & Male & Male & 5/20/2013 & 5/20/2013 & 12:39
PM & 12:39 PM \\
1002 & Larry & Larry & Male & Male & 4/20/2013 & 4/20/2013 & 4:45 PM &
4:45 PM \\
1003 & Albert & Albert & Male & Male & 5/15/2012 & 5/15/2012 & 6:24 PM &
6:24 PM \\
\end{longtable}

    
    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0366}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0915}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0915}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0976}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.0976}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1585}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1585}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1341}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 16\tabcolsep) * \real{0.1341}}@{}}
\caption{Show part 2 of the merged and reordered
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \% orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \% edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team edit
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Salary edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \% orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Bonus \% edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
Senior Management edit
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team orig
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team edit
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 97308 & 97308 & 6945 & 6945 & True & 1 & Marketing & Marketing \\
1 & 61933 & nan & 4.17 & nan & True & nan & nan & nan \\
2 & 130590 & 130590 & 11858 & 11858 & False & 0 & Finance & Finance \\
3 & 138705 & 138705 & 9.34 & 9.34 & True & 1 & Finance & Finance \\
4 & 101004 & 101004 & 1389 & 1389 & True & 1 & Client Services & Client
Services \\
999 & 132483 & 132483 & 16655 & 16655 & False & 0 & Distribution &
Distribution \\
1000 & 42392 & 42392 & 19675 & 19675 & False & 0 & Finance & Finance \\
1001 & 96914 & 96914 & 1421 & 1421 & False & 0 & Product & Product \\
1002 & 60500 & 60500 & 11985 & 11985 & False & 0 & Business Development
& Business Development \\
1003 & 129949 & 129949 & 10169 & 10169 & True & 1 & Sales & Sales \\
\end{longtable}

    
    To avoid problems with the subsequent comparison, \textbf{missing
values} in the cells are \textbf{replaced} with the string \texttt{NaN}
using the function \texttt{fillna()}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{45}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{fillna}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{NaN}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{inplace} \PY{o}{=} \PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    For a better overview, only the \textbf{rows} that show
\textbf{differences in the individual cells} will be displayed during
the comparison. For this purpose, the new, empty dataframe
\texttt{employees\_df\_diff} is created with the same column labels as
the original dataframe \texttt{employees\_df\_orig}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{46}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df\PYZus{}diff} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{DataFrame}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{n}{employees\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{columns}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    The function \texttt{dataframe\_add\_row()} is used to easily
\textbf{add rows} to a \textbf{dataframe}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{47}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k}{def} \PY{n+nf}{dataframe\PYZus{}add\PYZus{}row}\PY{p}{(}\PY{n}{df}\PY{o}{=}\PY{k+kc}{None}\PY{p}{,} \PY{n}{row}\PY{o}{=}\PY{p}{[}\PY{p}{]}\PY{p}{)}\PY{p}{:}
    \PY{k}{if} \PY{p}{(}\PY{n}{df} \PY{o+ow}{is} \PY{k+kc}{None}\PY{p}{)}\PY{p}{:}
        \PY{k}{return}
    
    \PY{c+c1}{\PYZsh{} Add a row to dataframe}
    \PY{n}{df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{]} \PY{o}{=} \PY{n}{row}
    
    \PY{c+c1}{\PYZsh{} Shift the index of the dataframe}
    \PY{n}{df}\PY{o}{.}\PY{n}{index} \PY{o}{=} \PY{n}{df}\PY{o}{.}\PY{n}{index} \PY{o}{+} \PY{l+m+mi}{1}
    
    \PY{c+c1}{\PYZsh{} Reset the index of dataframe and }
    \PY{c+c1}{\PYZsh{} avoid the old index being added as a column}
    \PY{n}{df}\PY{o}{.}\PY{n}{reset\PYZus{}index}\PY{p}{(}\PY{n}{drop}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Now the merged dataframe \texttt{employees\_df\_merged} is
\textbf{iterated} through \textbf{row by row} and \textbf{column by
column} in a \textbf{nested loop}. In the inner loop, the
\textbf{original cell value} (column with suffix \texttt{orig}) is
\textbf{compared} with the adjacent \textbf{edited cell value} (column
with suffix \texttt{edit}).

If a \textbf{difference} is detected, the \textbf{original} and the
\textbf{edited cell values} are written into a \textbf{common new cell},
marked with the \textbf{difference symbol}
\texttt{\textless{}=\textgreater{}}. Then, only the \textbf{rows
containing differences} in the cells are \textbf{added} to the
\texttt{employees\_df\_diff} \textbf{dataframe}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{48}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{b\PYZus{}diffs\PYZus{}found} \PY{o}{=} \PY{k+kc}{False}

\PY{c+c1}{\PYZsh{} Iterate over rows}
\PY{k}{for} \PY{n}{rowIndex}\PY{p}{,} \PY{n}{row} \PY{o+ow}{in} \PY{n}{employees\PYZus{}df\PYZus{}merged}\PY{o}{.}\PY{n}{iterrows}\PY{p}{(}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Iterate over columns}
    \PY{k}{for} \PY{n}{column}\PY{p}{,} \PY{n}{value} \PY{o+ow}{in} \PY{n}{row}\PY{o}{.}\PY{n}{items}\PY{p}{(}\PY{p}{)}\PY{p}{:}
        \PY{c+c1}{\PYZsh{} Omit index column for comparison}
        \PY{k}{if} \PY{n}{column} \PY{o}{==} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:}
            \PY{n}{row\PYZus{}li} \PY{o}{=} \PY{p}{[}\PY{n}{value}\PY{p}{]}
            \PY{k}{continue}
        \PY{k}{if} \PY{n}{column}\PY{o}{.}\PY{n}{endswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ orig}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{:}
            \PY{n}{value\PYZus{}orig} \PY{o}{=} \PY{n}{value}
        \PY{k}{elif} \PY{n}{column}\PY{o}{.}\PY{n}{endswith}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ edit}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{p}{:}
            \PY{n}{value\PYZus{}edit} \PY{o}{=} \PY{n}{value}
            \PY{k}{if} \PY{n}{value\PYZus{}orig} \PY{o}{!=} \PY{n}{value\PYZus{}edit}\PY{p}{:}
                \PY{c+c1}{\PYZsh{} Combine original and edited value and mark with diff symbol \PYZsq{}\PYZlt{}=\PYZgt{}\PYZsq{}}
                \PY{n}{row\PYZus{}li}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n+nb}{str}\PY{p}{(}\PY{n}{value\PYZus{}orig}\PY{p}{)} \PY{o}{+} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ \PYZlt{}=\PYZgt{} }\PY{l+s+s1}{\PYZsq{}} \PY{o}{+} \PY{n+nb}{str}\PY{p}{(}\PY{n}{value\PYZus{}edit}\PY{p}{)}\PY{p}{)}
                \PY{n}{b\PYZus{}diffs\PYZus{}found} \PY{o}{=} \PY{k+kc}{True}
            \PY{k}{elif} \PY{n}{value\PYZus{}orig} \PY{o}{==} \PY{n}{value\PYZus{}edit}\PY{p}{:}
                \PY{n}{row\PYZus{}li}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n+nb}{str}\PY{p}{(}\PY{n}{value\PYZus{}orig}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Add new row to dataframe only if differences were found}
    \PY{k}{if} \PY{n}{b\PYZus{}diffs\PYZus{}found}\PY{p}{:}
        \PY{n}{dataframe\PYZus{}add\PYZus{}row}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}diff}\PY{p}{,} \PY{n}{row\PYZus{}li}\PY{p}{)}
        \PY{n}{b\PYZus{}diffs\PYZus{}found} \PY{o}{=} \PY{k+kc}{False}

\PY{c+c1}{\PYZsh{} Add suffix to the column names to visualize the original and the edited values}
\PY{n}{employees\PYZus{}df\PYZus{}diff}\PY{o}{.}\PY{n}{rename}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{k}{lambda} \PY{n}{x} \PY{p}{:} \PY{n}{x}\PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{ o\PYZlt{}=\PYZgt{}e}\PY{l+s+s1}{\PYZsq{}} \PY{k}{if} \PY{n}{x} \PY{o}{!=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{idx}\PY{l+s+s1}{\PYZsq{}} \PY{k}{else} \PY{n}{x}\PY{p}{,} 
                         \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{}employees\PYZus{}df\PYZus{}diff}
\end{Verbatim}
\end{tcolorbox}

    Finally, the \textbf{differences} can be visualized even more
prominently by \textbf{highlighting} the \textbf{cell backgrounds in
color}. For this purpose the function \texttt{style.apply()} provided in
\texttt{Pandas} is used. An \textbf{inline lambda function} searches the
cells for the difference symbol \texttt{\textless{}=\textgreater{}} and
\textbf{highlights} the \textbf{cell} with yellow color.

This provides \textbf{functionality similar} to what
\textbf{spreadsheet programs} (e.g.~OpenOffice Calc) offer as so-called
\textbf{``conditional formatting''}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{49}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Switch to apply highlight style to dataframe}
\PY{c+c1}{\PYZsh{} HINT: Set to \PYZsq{}False\PYZsq{} when compiling to PDF!}
\PY{n}{highlight} \PY{o}{=} \PY{k+kc}{False}

\PY{k}{if} \PY{n}{highlight}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Highlight cells by condition}
    \PY{n}{output} \PY{o}{=} \PY{n}{employees\PYZus{}df\PYZus{}diff}\PY{o}{.}\PY{n}{style}\PY{o}{.}\PY{n}{apply}\PY{p}{(}\PY{k}{lambda} \PY{n}{x}\PY{p}{:} 
                                          \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{background: yellow}\PY{l+s+s1}{\PYZsq{}} 
                                           \PY{k}{if} \PY{n+nb}{str}\PY{p}{(}\PY{n}{v}\PY{p}{)}\PY{o}{.}\PY{n}{find}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZlt{}=\PYZgt{}}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)} \PY{o}{!=} \PY{o}{\PYZhy{}}\PY{l+m+mi}{1}
                                           \PY{k}{else} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{\PYZdq{}} \PY{k}{for} \PY{n}{v} \PY{o+ow}{in} \PY{n}{x}\PY{p}{]}\PY{p}{,} 
                                          \PY{n}{axis} \PY{o}{=} \PY{l+m+mi}{1}\PY{p}{)}
    \PY{n}{display}\PY{p}{(}\PY{n}{output}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Visualize differences between the original and }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{                   the edited employees dataset}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{employees\PYZus{}df\PYZus{}diff}\PY{o}{.}\PY{n}{head\PYZus{}tail}\PY{p}{(}\PY{l+m+mi}{6}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0286}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0400}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1143}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1086}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1143}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1429}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1029}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0971}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.1543}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 18\tabcolsep) * \real{0.0971}}@{}}
\caption{Visualize differences between the original and the edited
employees dataset}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Salary o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Bonus \% o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team o\textless=\textgreater e
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
idx
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
First Name o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Gender o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Start Date o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Last Login Time o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Salary o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Bonus \% o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Senior Management o\textless=\textgreater e
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
Team o\textless=\textgreater e
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
0 & 1 & Thomas \textless=\textgreater{} NaN & Male
\textless=\textgreater{} NaN & 3/31/1996 \textless=\textgreater{} NaN &
6:53 AM \textless=\textgreater{} NaN & 61933.0 \textless=\textgreater{}
NaN & 4.17 \textless=\textgreater{} NaN & True \textless=\textgreater{}
NaN & NaN \\
1 & 7 & NaN & Female \textless=\textgreater{} NaN & 7/20/2015
\textless=\textgreater{} NaN & 10:43 AM \textless=\textgreater{} NaN &
45906.0 \textless=\textgreater{} NaN & 11598.0 \textless=\textgreater{}
NaN & NaN & Finance \textless=\textgreater{} NaN \\
2 & 10 & Louise \textless=\textgreater{} NaN & Female
\textless=\textgreater{} NaN & 8/12/1980 \textless=\textgreater{} NaN &
9:01 AM \textless=\textgreater{} NaN & 63241.0 \textless=\textgreater{}
NaN & 15132.0 \textless=\textgreater{} NaN & True
\textless=\textgreater{} NaN & NaN \\
3 & 17 & Shawn & Male & 12/7/1986 & 7:45 PM & NaN
\textless=\textgreater{} 90370.0 & 6414.0 & False & Product \\
4 & 20 & Lois & NaN \textless=\textgreater{} No Gender & 4/22/1995 &
7:18 PM & 64714.0 & 4934.0 & True & Legal \\
5 & 22 & Joshua & NaN \textless=\textgreater{} No Gender & 3/8/2012 &
1:58 AM & 90816.0 & 18816.0 & True & Client Services \\
239 & 955 & NaN & Female \textless=\textgreater{} NaN & 9/14/2010
\textless=\textgreater{} NaN & 5:19 AM \textless=\textgreater{} NaN &
143638.0 \textless=\textgreater{} NaN & 9662.0 \textless=\textgreater{}
NaN & NaN & NaN \\
240 & 965 & Antonio & NaN \textless=\textgreater{} No Gender & 6/18/1989
& 9:37 PM & 103050.0 & 3.05 & False & Legal \\
241 & 976 & Victor & NaN \textless=\textgreater{} No Gender & 7/28/2006
& 2:49 PM & 76381.0 & 11159.0 & True & Sales \\
242 & 989 & Stephen & NaN \textless=\textgreater{} No Gender & 7/10/1983
& 8:10 PM & 85668.0 & 1909.0 & False & Legal \\
243 & 993 & Justin & NaN \textless=\textgreater{} No Gender & 2/10/1991
& 4:58 PM & 38344.0 & 3794.0 & False & Legal \\
244 & 999 & Henry & NaN \textless=\textgreater{} No Gender & 11/23/2014
& 6:09 AM & 132483.0 & 16655.0 & False & Distribution \\
\end{longtable}

    
    \hypertarget{save-edited-dataset-to-new-csv-file}{%
\subsubsection{Save edited dataset to new CSV
file}\label{save-edited-dataset-to-new-csv-file}}

After the anomalies in the dataset have been found---and repaired where
appropriate---the dataset can be saved as a new CSV file for later use.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{50}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{csv\PYZus{}filepath} \PY{o}{=} \PY{l+s+sa}{r}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/employees\PYZus{}edit\PYZus{}repaired.csv}\PY{l+s+s1}{\PYZsq{}}

\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{to\PYZus{}csv}\PY{p}{(}\PY{n}{csv\PYZus{}filepath}\PY{p}{,} \PY{n}{sep} \PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{,}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{index} \PY{o}{=} \PY{k+kc}{False}\PY{p}{,} \PY{n}{header}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{avoidance-of-tendencies-due-to-bias}{%
\subsection{\texorpdfstring{Avoidance of \textbf{tendencies due to
bias}}{Avoidance of tendencies due to bias}}\label{avoidance-of-tendencies-due-to-bias}}

The description of the Iris dataset states that it consists of \textbf{50
samples} from \textbf{each of three species} of Iris (\emph{Iris
setosa}, \emph{Iris virginica} and \emph{Iris versicolor}), so there are
\textbf{150 samples in total}.

But how can this be verified? The following subsections provide some
\textbf{ideas} on how to \textbf{examine the dataset} for
\textbf{tendencies} as a \textbf{cause of bias}.

\hypertarget{count-occurrences-of-unique-values}{%
\subsubsection{Count occurrences of unique
values}\label{count-occurrences-of-unique-values}}

To check whether all possible classes are included in the dataset and
equally distributed, the function \texttt{df.value\_counts()} can be
used.

The following parameters are available for fine-tuning:

\begin{itemize}
\tightlist
\item
  \texttt{ascending=False}: the resulting classes are sorted in descending order
\item
  \texttt{dropna=False}: NaN values are included in the counts
\item
  \texttt{normalize=True}: relative frequencies of the unique values are
  returned
\end{itemize}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{51}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Count unique values and missing values in a column,}
\PY{c+c1}{\PYZsh{} ordered descending and absolute values}
\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{value\PYZus{}counts}\PY{p}{(}\PY{n}{ascending}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{dropna}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{normalize}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{51}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: species, dtype: int64
\end{Verbatim}
\end{tcolorbox}
        
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{52}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import (again) employees dataset to dataframe from csv file}
\PY{c+c1}{\PYZsh{}csv\PYZus{}filepath = r\PYZsq{}./datasets/employees\PYZus{}edit.csv\PYZsq{}}
\PY{n}{csv\PYZus{}filepath} \PY{o}{=} \PY{l+s+sa}{r}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/employees\PYZus{}edit\PYZus{}repaired.csv}\PY{l+s+s1}{\PYZsq{}}

\PY{n}{employees\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{n}{csv\PYZus{}filepath}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{53}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Count unique values and missing values in a column,}
\PY{c+c1}{\PYZsh{} ordered descending and absolute values}
\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Team}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{value\PYZus{}counts}\PY{p}{(}\PY{n}{ascending}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{dropna}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{normalize}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{53}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
Client Services         99
Business Development    98
Finance                 97
Product                 92
Marketing               91
Legal                   86
Engineering             86
Sales                   86
Human Resources         85
Distribution            77
Name: Team, dtype: int64
\end{Verbatim}
\end{tcolorbox}
        
    \hypertarget{display-histogram}{%
\subsubsection{Display Histogram}\label{display-histogram}}

This section was inspired by:
\href{https://dataindependent.com/pandas/pandas-histogram-dataframe-hist/}{Pandas
Histogram -- DataFrame.hist()}.

\textbf{Histograms} represent \textbf{frequency distributions}
graphically. This requires the separation of the data into classes
(so-called \textbf{bins}).

These bins are represented in the histogram as rectangles of equal or
variable width. The height of each rectangle then represents the
(relative or absolute) \textbf{frequency density}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{54}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{column}\PY{o}{=}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{auto}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{density}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} 
                  \PY{n}{zorder}\PY{o}{=}\PY{l+m+mi}{2}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Set the title of the histogram}
\PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the histogram}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary distribution over all gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (relative)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Show grid and hide behind the bars}
\PY{n}{plt}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{zorder}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mf}{1.0}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_143_0.png}\end{center}
        \caption{Histogram showing the relative frequency distribution of the salary}
        \label{fig:histogram_salary}
    \end{figure}
    
    Apart from the not very appealing \textbf{standard formatting of the
histogram} above, there is also \textbf{no breakdown of salaries} by
\textbf{gender} here.

The following function allows a \textbf{gender-specific presentation} of
salaries with significantly \textbf{more information content} in the
individual subplots.

In addition, \textbf{probability density functions (PDF)} were overlaid
on the histograms. Their parameters \textbf{mean} and
\textbf{standard deviation} were estimated beforehand by fitting a normal
distribution to the data. This makes it possible to estimate whether the
\textbf{data is normally distributed}. In order to be able to reuse the
code later, it was implemented as the \textbf{function}
\texttt{func\_plot\_histograms\_from\_list\_with\_PDF()}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{55}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{scipy}\PY{n+nn}{.}\PY{n+nn}{stats} \PY{k+kn}{import} \PY{n}{norm}

\PY{k}{def} \PY{n+nf}{func\PYZus{}plot\PYZus{}histograms\PYZus{}from\PYZus{}list\PYZus{}with\PYZus{}PDF}\PY{p}{(}\PY{n}{df\PYZus{}list}\PY{p}{,} \PY{n}{column}\PY{p}{,} \PY{n}{titles}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Number of bins for the histogram}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}integer\PYZgt{}: defines the number of equal\PYZhy{}width bins in the range}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}string\PYZgt{}: one of the binning strategies is used:}
    \PY{c+c1}{\PYZsh{}   \PYZsq{}auto\PYZsq{}, \PYZsq{}fd\PYZsq{}, \PYZsq{}doane\PYZsq{}, \PYZsq{}scott\PYZsq{}, \PYZsq{}stone\PYZsq{}, \PYZsq{}rice\PYZsq{}, \PYZsq{}sturges\PYZsq{}, or \PYZsq{}sqrt\PYZsq{}}
    \PY{n}{n\PYZus{}bins} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{auto}\PY{l+s+s1}{\PYZsq{}}
    \PY{n}{subplot\PYZus{}columns} \PY{o}{=} \PY{n+nb}{len}\PY{p}{(}\PY{n}{df\PYZus{}list}\PY{p}{)}
    \PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{1}\PY{p}{,} \PY{n}{subplot\PYZus{}columns}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{14}\PY{p}{,} \PY{l+m+mi}{4}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set margins between subplots}
    \PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.4}\PY{p}{)}
    \PY{n}{ylim\PYZus{}max} \PY{o}{=} \PY{p}{(}\PY{l+m+mf}{0.0}\PY{p}{,} \PY{l+m+mf}{0.0}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
    \PY{k}{for} \PY{n}{df}\PY{p}{,} \PY{n}{title}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{df\PYZus{}list}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}

        \PY{n}{subplot}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{df}\PY{p}{[}\PY{n}{column}\PY{p}{]}\PY{p}{,} \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{density}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} 
                     \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Fit a normal distribution to the data}
        \PY{c+c1}{\PYZsh{} with mean and standard deviation}
        \PY{n}{mu}\PY{p}{,} \PY{n}{std} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{df}\PY{p}{[}\PY{n}{column}\PY{p}{]}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Plot the probability density function (PDF)}
        \PY{n}{xmin}\PY{p}{,} \PY{n}{xmax} \PY{o}{=} \PY{n}{subplot}\PY{o}{.}\PY{n}{get\PYZus{}xlim}\PY{p}{(}\PY{p}{)}

        \PY{n}{x} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linspace}\PY{p}{(}\PY{n}{xmin}\PY{p}{,} \PY{n}{xmax}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{)}
        \PY{n}{p} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{pdf}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}

        \PY{n}{subplot}\PY{o}{.}\PY{n}{plot}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{p}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{k}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{linewidth}\PY{o}{=}\PY{l+m+mi}{2}\PY{p}{)}

        \PY{n}{title\PYZus{}concat} \PY{o}{=} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Salary of }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{ (Mean: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{, }\PY{l+s+se}{\PYZbs{}n}\PY{l+s+s2}{\PYZdq{}} \PYZbs{}
                               \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Std. deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{)}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Set the title of the histogram}
        \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the histogram}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{n}{title\PYZus{}concat}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{20}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Show grid}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (relative)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Rotate x\PYZhy{}ticks by \PYZhy{}45°}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{tick\PYZus{}params}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{x}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{labelrotation}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mi}{45}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Get maximum range of y\PYZhy{}axes over all histograms}
        \PY{k}{if} \PY{n}{ylim\PYZus{}max}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]} \PY{o}{\PYZlt{}} \PY{n}{subplot}\PY{o}{.}\PY{n}{get\PYZus{}ylim}\PY{p}{(}\PY{p}{)}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]}\PY{p}{:}
            \PY{c+c1}{\PYZsh{} Take new maximum}
            \PY{n}{ylim\PYZus{}max} \PY{o}{=} \PY{n}{subplot}\PY{o}{.}\PY{n}{get\PYZus{}ylim}\PY{p}{(}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Set all y\PYZhy{}axes to the same range for comparison}
    \PY{n}{plt}\PY{o}{.}\PY{n}{setp}\PY{p}{(}\PY{n}{subplots}\PY{p}{,} \PY{n}{ylim}\PY{o}{=}\PY{n}{ylim\PYZus{}max}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{56}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{genders} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Male}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Female}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{No Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Create list for storing the dataframes}
\PY{n}{li\PYZus{}employees\PYZus{}df} \PY{o}{=} \PY{n+nb}{list}\PY{p}{(}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Filter employees by gender}
\PY{k}{for} \PY{n}{gender} \PY{o+ow}{in} \PY{n}{genders}\PY{p}{:}
    \PY{n}{li\PYZus{}employees\PYZus{}df}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{o}{.}\PY{n}{loc}\PY{p}{[}\PY{p}{(}\PY{n}{employees\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Gender}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{==} \PY{n}{gender}\PY{p}{)}\PY{p}{]}\PY{p}{)}

\PY{n}{func\PYZus{}plot\PYZus{}histograms\PYZus{}from\PYZus{}list\PYZus{}with\PYZus{}PDF}\PY{p}{(}\PY{n}{li\PYZus{}employees\PYZus{}df}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Salary}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{genders}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_146_0.png}\end{center}
        \caption{Histograms used to explore the relative frequency distribution of the salary in comparison between the genders (with overlaid probability density functions (PDF))}
        \label{fig:histogram_salary_with_PDF}
    \end{figure}
    
    \hypertarget{first-idea-of-correlations-in-dataset}{%
\subsection{\texorpdfstring{First \textbf{idea of correlations} in
dataset}{First idea of correlations in dataset}}\label{first-idea-of-correlations-in-dataset}}

To get a rough idea of the \textbf{dependencies} and
\textbf{correlations} in the dataset, it can be helpful to visualize the
whole dataset in a \textbf{correlation heatmap}. It shows at a glance
which variables are correlated, to what degree and in which direction.

Later, two particularly well-correlated variables are selected from the
dataset and plotted in a \textbf{scatterplot}.

    \hypertarget{visualize-data-with-correlation-heatmap}{%
\subsubsection{\texorpdfstring{Visualize data with \textbf{correlation
heatmap}}{Visualize data with correlation heatmap}}\label{visualize-data-with-correlation-heatmap}}

This section was inspired by
\href{https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e}{How
to Create a Seaborn Correlation Heatmap in Python?}.

\begin{quote}
\textbf{Correlation matrices} are an \textbf{essential tool of
exploratory data analysis}. Correlation heatmaps contain the same
information in a visually appealing way. What more: they show in a
glance which variables are correlated, to what degree, in which
direction, and alerts us to potential multicollinearity problems
(source: ibidem).
\end{quote}

\hypertarget{simple-correlation-matrix}{%
\paragraph{Simple correlation matrix}\label{simple-correlation-matrix}}

Because \textbf{string values can never be correlated}, the class names
(species) have to be converted first:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{57}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} encoding the class column}
\PY{n}{irisdata\PYZus{}df\PYZus{}enc} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{replace}\PY{p}{(}\PY{p}{\PYZob{}}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{species}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}  \PY{p}{\PYZob{}}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}setosa}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{0}\PY{p}{,}
                                                    \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}versicolor}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{1}\PY{p}{,} 
                                                    \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}virginica}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{2}\PY{p}{\PYZcb{}}\PY{p}{\PYZcb{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{58}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Simple correlation matrix}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{o}{.}\PY{n}{corr}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1609}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1839}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1724}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1839}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1724}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1264}}@{}}
\caption{Simple correlation matrix}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
sepal\_length & 1 & -0.109369 & 0.871754 & 0.817954 & 0.782561 \\
sepal\_width & -0.109369 & 1 & -0.420516 & -0.356544 & -0.419446 \\
petal\_length & 0.871754 & -0.420516 & 1 & 0.962757 & 0.949043 \\
petal\_width & 0.817954 & -0.356544 & 0.962757 & 1 & 0.956464 \\
species & 0.782561 & -0.419446 & 0.949043 & 0.956464 & 1 \\
\end{longtable}

    
    \hypertarget{correlation-heatmap}{%
\paragraph{Correlation heatmap}\label{correlation-heatmap}}

The color sets are chosen from
\href{https://pod.hatenablog.com/entry/2018/09/20/212527}{color map}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{59}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} increase the size of the heatmap}
\PY{n}{plt}\PY{o}{.}\PY{n}{figure}\PY{p}{(}\PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{16}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{} store heatmap object in a variable to easily access it }
\PY{c+c1}{\PYZsh{} when you want to include more features (such as title)}
\PY{c+c1}{\PYZsh{} set the range of values to be displayed on the colormap from \PYZhy{}1 to 1,}
\PY{c+c1}{\PYZsh{} and set \PYZsq{}annotation=True\PYZsq{} to display the correlation values on the heatmap}
\PY{n}{heatmap} \PY{o}{=} \PY{n}{sns}\PY{o}{.}\PY{n}{heatmap}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{o}{.}\PY{n}{corr}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{vmin}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{,} \PY{n}{vmax}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{,} 
                      \PY{n}{annot}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{cmap}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{PRGn\PYZus{}r}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} set a title to the heatmap}
\PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the heatmap}
\PY{n}{heatmap}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Correlation Heatmap}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{16}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_152_0.png}\end{center}
        \caption{Correlation heatmap to explore relationships between individual variables in the Iris dataset}
        \label{fig:correlation_heatmap}
    \end{figure}
    
    \hypertarget{triangle-correlation-heatmap}{%
\paragraph{Triangle correlation
heatmap}\label{triangle-correlation-heatmap}}

When looking at the correlation heatmap above, you would not lose any
information by \textbf{cutting} away half of it \textbf{along the
diagonal} line marked by ones.

The \textbf{numpy} function \texttt{np.triu()} can be used to isolate
the upper triangle of a matrix while turning all the values in the lower
triangle into 0.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{60}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{np}\PY{o}{.}\PY{n}{triu}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{ones\PYZus{}like}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{o}{.}\PY{n}{corr}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{60}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
array([[1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1.],
       [0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 1.]])
\end{Verbatim}
\end{tcolorbox}
        
    Use this mask to cut the heatmap along the diagonal:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{61}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{plt}\PY{o}{.}\PY{n}{figure}\PY{p}{(}\PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{16}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{} define the mask to set the values in the upper triangle to \PYZsq{}True\PYZsq{}}
\PY{n}{mask} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{triu}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{ones\PYZus{}like}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{o}{.}\PY{n}{corr}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{dtype}\PY{o}{=}\PY{n+nb}{bool}\PY{p}{)}\PY{p}{)}

\PY{n}{heatmap} \PY{o}{=} \PY{n}{sns}\PY{o}{.}\PY{n}{heatmap}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{o}{.}\PY{n}{corr}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{mask}\PY{o}{=}\PY{n}{mask}\PY{p}{,} 
                      \PY{n}{vmin}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{,} \PY{n}{vmax}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{,} \PY{n}{annot}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{cmap}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{PRGn\PYZus{}r}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{heatmap}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Triangle Correlation Heatmap}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{16}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_156_0.png}\end{center}
        \caption{Correlation heatmap, which was cut at its main diagonal without losing any information}
        \label{fig:correlation_heatmap_triangle}
    \end{figure}
    
    From the \textbf{heatmaps} we can see that the dimensions of the
\textbf{petals} are the \textbf{most strongly correlated columns} (0.95 and 0.96) with
the \textbf{type of flowers} (species classes).

The correlation between \textbf{sepal length} and \textbf{petal
length} is somewhat lower (0.87).

    \hypertarget{visualize-data-with-scatter-plot}{%
\subsubsection{\texorpdfstring{Visualize data with \textbf{scatter
plot}}{Visualize data with scatter plot}}\label{visualize-data-with-scatter-plot}}

In the following, \href{https://seaborn.pydata.org/}{Seaborn} is applied
which is a library for making statistical graphics in Python. It is
built on top of matplotlib and closely integrated with \texttt{Pandas}
data structures.

To investigate whether there are dependencies (e.g.~correlations) in
\texttt{irisdata\_df} between individual variables in the dataset, it is
advisable to plot them in a \textbf{scatter plot}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{62}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} There are five preset seaborn themes: darkgrid, whitegrid, dark, white, and ticks.}
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{whitegrid}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} \PYZsq{}sepal\PYZus{}length\PYZsq{}, \PYZsq{}petal\PYZus{}length\PYZsq{} are Iris feature data}
\PY{c+c1}{\PYZsh{} \PYZsq{}height\PYZsq{} used to define height of graph}
\PY{c+c1}{\PYZsh{} \PYZsq{}hue\PYZsq{} stores the class/label of Iris dataset}
\PY{n}{sns}\PY{o}{.}\PY{n}{FacetGrid}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{hue} \PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{species}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,}
              \PY{n}{height} \PY{o}{=} \PY{l+m+mi}{5}\PY{p}{)}\PY{o}{.}\PY{n}{map}\PY{p}{(}\PY{n}{plt}\PY{o}{.}\PY{n}{scatter}\PY{p}{,}
                              \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,}
                              \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}\PY{o}{.}\PY{n}{add\PYZus{}legend}\PY{p}{(}\PY{p}{)}


\PY{c+c1}{\PYZsh{} y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Scatterplot of petal length and width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{1.05}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal width [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal length [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_159_0.png}\end{center}
        \caption{Plotting two individual variables of the iris dataset in the scatterplot to explore the relationships between these two}
        \label{fig:scatter_plot}
    \end{figure}
    
    \hypertarget{visualize-data-with-pairs-plot}{%
\subsubsection{\texorpdfstring{Visualize data with \textbf{pairs
plot}}{Visualize data with pairs plot}}\label{visualize-data-with-pairs-plot}}

For systematic investigation of dependencies, all variables (each
against each) are plotted in separate scatter plots.

With this so-called
\textbf{\href{https://vita.had.co.nz/papers/gpp.pdf}{pairs plot}} it is
possible to see both \textbf{relationships} between two variables and
\textbf{distribution} of single variables.

The function \texttt{sns.pairplot()} creates a grid of axes such that \textbf{each numeric
variable} in \texttt{irisdata\_df} is shared across the y-axes of a
single row and across the x-axes of a single column.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{63}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{white}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{n}{g} \PY{o}{=} \PY{n}{sns}\PY{o}{.}\PY{n}{pairplot}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{diag\PYZus{}kind}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{kde}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{n}{hue}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} 
                 \PY{n}{palette}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Dark2}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{height}\PY{o}{=}\PY{l+m+mf}{2.0}\PY{p}{)}

\PY{n}{g}\PY{o}{.}\PY{n}{map\PYZus{}lower}\PY{p}{(}\PY{n}{sns}\PY{o}{.}\PY{n}{kdeplot}\PY{p}{,} \PY{n}{levels}\PY{o}{=}\PY{l+m+mi}{4}\PY{p}{,} \PY{n}{color}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{.2}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{} x, y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Pairs plot of the Iris dataset}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{x}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mf}{1.0}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{4.3}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_161_0.png}\end{center}
        \caption{Plot all individual variables of the Iris dataset in pairs plot to see both the relationships between two variables and the distribution of the individual variables}
        \label{fig:pairs_plot}
    \end{figure}
    
    \hypertarget{step-3-choose-and-create-the-ml-model}{%
\section{STEP 3: Choose and create the ML
model}\label{step-3-choose-and-create-the-ml-model}}

After exploring the dataset, in this step one has to decide on a
specific ML algorithm based on certain selection criteria.

However, since the AI or ML world is so huge that an ML novice can
hardly gain an overview of it, a brief description of the \textbf{relationship
between AI and ML} is given in the following sections. Furthermore, a
\textbf{taxonomy} of the different \textbf{learning types} is presented
by also providing some example algorithms.

\hypertarget{short-overview-of-the-ai-world}{%
\subsection{Short overview of the AI
world}\label{short-overview-of-the-ai-world}}

The history of \textbf{Artificial Intelligence (AI)} with the
\href{https://en.wikipedia.org/wiki/Dartmouth_workshop}{Dartmouth
Conference} in the summer of 1956 as its birth is characterized by
several successive hype and low phases (so-called
\href{https://de.wikipedia.org/wiki/KI-Winter}{AI Winter}). During the
hype phases, many \textbf{new insights} were gathered by AI researchers
and interesting \textbf{application areas} were explored. However, there
were always times when the verifiable successes fell far short of the
previously awakened (inflated) expectations. The consequence was then a
decreasing interest in AI research and accompanying drastic cuts in
research budgets. The \textbf{high media attention} in combination with
often \textbf{vague and not very clear definitions} of AI can be seen as
a potential reason for the often circulating inflated expectations of AI
technologies.

Due to the increase in knowledge from AI research as well as the
exploration of new application areas, the technical terms and especially
the AI definitions have been subject to constant change over the past
decades. In the currently published standard
\href{https://webstore.iec.ch/publication/77839}{ISO/IEC 22989:2022-07},
\textbf{AI systems} have been defined by the \textbf{Subcommittee 42 -
`Artificial Intelligence' (SC 42)} of the \textbf{ISO/IEC Joint
Technical Committee (JTC 1)} as follows (see definition 3.1.4 in
\cite{ISO_IEC_22989_2022-07}).

The main part of the \textbf{definition} describes what an AI system
(should) do:

\begin{quote}
{[}An \textbf{artificial intelligence system} is an{]} engineered system
that generates outputs such as \textbf{content}, \textbf{forecasts},
\textbf{recommendations} or \textbf{decisions} for a given set of
\textbf{human-defined objectives}.
\end{quote}

A \textbf{note} to the definition describes the techniques necessary to
achieve this:

\begin{quote}
{[}..{]} The engineered system can use various techniques and approaches
related to artificial intelligence to develop a \textbf{model} to
represent data, \textbf{knowledge}, processes, etc. which can be used to
conduct \textbf{tasks}.
\end{quote}

The \textbf{knowledge} is derived from abstracted information about
objects, events, concepts or rules as well as their properties and
relations to each other. It is organized for purposeful systematic use.
The \textbf{model} is a physical, mathematical, or
otherwise logical representation of a system. The \textbf{task}, in turn,
consists of a set of actions required to achieve a specific goal.

\textbf{Machine Learning (ML)} as a subset of AI, on the other hand,
addresses the mathematical models and algorithms that enable a computer
system to recognize (new) correlations in huge amounts of sample data
from various sources by inferring them independently.

The umbrella term AI covers a very large research area. It includes a
number of techniques, like:

\begin{itemize}
\tightlist
\item
  Supervised and Unsupervised Learning,
\item
  Reinforcement Learning and
\item
  Genetic Algorithms
\end{itemize}

that enable computers to learn independently and solve complex problems
in the fields of, e.g.:

\begin{itemize}
\tightlist
\item
  Computer Vision (CV),
\item
  Computational Linguistics (CL) or
\item
  Robotics.
\end{itemize}

The following \textbf{Venn diagram} shows the relationship between AI,
machine learning and other integrated technologies. The sets that
do not belong to the main category represent techniques that can
function as stand-alone techniques and do not necessarily fall into the
artificial intelligence group in all cases (\cite{AI_beverages_2019}).

For example, simple \textbf{robotic behaviors} can be realized via fixed
pre-programmed \textbf{if-then-else decisions}. In images, objects can
be identified by
\href{https://en.wikipedia.org/wiki/Edge_detection}{edge detection} by
applying, for example,
\href{https://en.wikipedia.org/wiki/Sobel_operator}{Sobel} or
\href{https://en.wikipedia.org/wiki/Laplace_operator}{Laplace filters}.
In both examples, no learnable algorithms are needed, therefore the Venn
diagram was adapted accordingly.

    \begin{figure}
\centering
\includegraphics{images/AI_ML_venn_diagram_wide.png}
\caption{Venn diagram showing the relationship between AI, machine
learning and other integrated technologies (source: Kasper, adapted from
\cite{AI_beverages_2019}, license: CC BY-SA 4.0)}
\end{figure}

    \hypertarget{taxonomy-of-machine-learning-algorithms}{%
\subsection{Taxonomy of machine learning
algorithms}\label{taxonomy-of-machine-learning-algorithms}}

The field of machine learning can be divided into the following
\textbf{types of learning}:

\begin{itemize}
\tightlist
\item
  Supervised learning
\item
  Unsupervised learning
\item
  Semi-supervised learning
\item
  Reinforcement learning
\end{itemize}

Here are some further sources:

\begin{itemize}
\tightlist
\item
  \href{https://subscription.packtpub.com/book/big-data-/9781783558742/1/ch01lvl1sec12/taxonomy-of-machine-learning-algorithms}{Taxonomy
  of machine learning algorithms}
\item
  \href{https://www.researchgate.net/publication/340878018_Comprehensive_Survey_of_Machine_Learning_Approaches_in_Cognitive_Radio-Based_Vehicular_Ad_Hoc_Networks}{Comprehensive
  Survey of Machine Learning Approaches in Cognitive Radio-Based
  Vehicular Ad Hoc Networks}
\item
  \href{https://www.researchgate.net/publication/358089496_A_Taxonomy_of_Machine_Learning_Techniques}{A
  Taxonomy of Machine Learning Techniques}
\item
  \href{https://medium.com/@Shaier/ml-algorithms-one-sd-\%CF\%83-74bcb28fafb6}{ML
  Algorithms: One SD}
\item
  \href{https://github.com/trekhleb/homemade-machine-learning\#machine-learning-map}{Machine
  Learning Map}
\end{itemize}

    \hypertarget{supervised-learning}{%
\subsubsection{Supervised learning}\label{supervised-learning}}

The goal of
\href{https://en.wikipedia.org/wiki/Supervised_learning}{supervised
learning} is to learn a \textbf{function} that maps an \textbf{input to
an output}, based on example input-output pairs. This involves inferring
a relationship describable by a mathematical function from
\textbf{labeled training data} consisting of a set of training examples
(\cite{Wiki_Supervised_learning}).

A few well-known algorithms from the field of \textbf{supervised
learning} are mentioned here:

\begin{itemize}
\tightlist
\item
  Naive Bayes
\item
  Linear Regression
\item
  Logistic Regression
\item
  Artificial Neural Networks (ANN)
\item
  Support Vector Classifier (SVC)
\item
  Decision Trees
\item
  Random Forests
\end{itemize}

    \hypertarget{unsupervised-learning}{%
\subsubsection{Unsupervised learning}\label{unsupervised-learning}}

The algorithms of
\href{https://en.wikipedia.org/wiki/Unsupervised_learning}{unsupervised
learning} look for internal structures in the data of a dataset, such as
\textbf{grouping} or \textbf{clustering of data points}. These
algorithms can thus learn relationships from test data that have not
been labeled, classified, or categorized. Rather than responding to
feedback (as in supervised learning), unsupervised learning algorithms
detect \textbf{commonalities in the data} and respond based on the
presence or absence of such commonalities in each new dataset
(\cite{Wiki_Unsupervised_learning}).

Here are some algorithms from the field of \textbf{unsupervised
learning}:

\begin{itemize}
\tightlist
\item
  K-means Clustering
\item
  Spectral Clustering
\item
  Hierarchical Clustering
\item
  Principal Component Analysis (PCA)
\end{itemize}

    \hypertarget{semi-supervised-learning}{%
\subsubsection{Semi-supervised
learning}\label{semi-supervised-learning}}

The
\href{https://en.wikipedia.org/wiki/Semi-supervised_learning}{semi-supervised
learning} falls between \textbf{unsupervised} learning (without any
labeled training data) and \textbf{supervised} learning (with completely
labeled training data). Some of the training examples are missing
training labels, yet many machine-learning researchers have found that
unlabeled data, when used in conjunction with a small amount of labeled
data, can produce a considerable improvement in learning accuracy
(\cite{Wiki_Semi-supervised_learning}).

    \hypertarget{reinforcement-learning}{%
\subsubsection{Reinforcement learning}\label{reinforcement-learning}}

This is an area of machine learning concerned with how
\textbf{intelligent agents} ought to \textbf{take actions in an
environment} in order to maximize the notion of cumulative
\textbf{reward}. Due to its generality, the field is studied in many
other disciplines, such as \textbf{game theory} and \textbf{control
theory}.

\href{https://en.wikipedia.org/wiki/Reinforcement_learning}{Reinforcement
learning} differs from supervised learning in \textbf{not needing
labeled input/output pairs} to be presented and in not needing sub-optimal
actions to be explicitly corrected. Instead the focus is on
\textbf{finding a balance} between \textbf{exploration} of uncharted
territory and \textbf{exploitation} of current knowledge
(\cite{Wiki_Reinforcement_learning}).

Here are some algorithms from the field of \textbf{reinforcement
learning}:

\begin{itemize}
\tightlist
\item
  Iterative Policy
\item
  Q-Learning
\item
  SARSA
\item
  Learning Classifiers
\item
  Stochastic Gradient
\item
  Genetic Algorithm
\end{itemize}

    \hypertarget{decision-graph-for-selecting-an-suitable-algorithm}{%
\subsection{Decision graph for selecting a suitable
algorithm}\label{decision-graph-for-selecting-an-suitable-algorithm}}

Now that the Iris dataset has been analyzed in terms of its data
structure and internal correlations, the most difficult task on the way
to solving a problem using machine learning arises: finding the
``right'' ML algorithm (also called \textbf{estimator}).

The diverse estimators available are more or less well qualified for the
respective problems with their partly very different data types. The
good news is that the ML software package \texttt{Scikit-Learn} provides
the following \textbf{flowchart} as a rough \textbf{guide} in choosing
the right estimator for the particular task (see:
\href{https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html}{Choosing
the right estimator}).

However, it must also be emphasized that a considerable \textbf{level of
experience} through systematic trial and error is crucial to be
successful in finding an ``optimal'' estimator.

    \begin{figure}
\centering
\includegraphics{images/scikit-learn_ml_algorithm_decision.png}
\caption{Decision graph for choosing an appropriate ML algorithm
(source:
\href{https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html}{Choosing
the right estimator}, license: unknown)}
\end{figure}

    \hypertarget{reasons-for-choosing-support-vector-classifier-svc}{%
\subsection{Reasons for choosing Support Vector Classifier
(SVC)}\label{reasons-for-choosing-support-vector-classifier-svc}}

Among other ML algorithms suitable for the Iris dataset (such as the
decision-tree-based \textbf{random-forests classifier}), the reasoned
choice here in this tutorial falls on the \textbf{support vector
classifier (SVC)}.

The following \textbf{reasons} led to the decision for the
\textbf{Support Vector Classifier (SVC)}:

\begin{itemize}
\tightlist
\item
  The aim is to predict the species using unlabeled test data, so the
  task is to \textbf{classify}.
\item
  The Iris dataset is \textbf{fully labeled} (by designating the Iris
  species).
\item
  The dataset contains significantly \textbf{less than 100k samples}.
\end{itemize}

But the most important reason is that it is \textbf{easy to understand}
how it works - so it is exactly suitable for a beginner tutorial.

    \hypertarget{operating-principal-of-svc}{%
\subsection{Operating principle of
SVC}\label{operating-principal-of-svc}}

\href{https://en.wikipedia.org/wiki/Support-vector_machine}{Support-vector
machines (SVMs)} are \textbf{supervised learning} models with associated
learning algorithms that analyze data for \textbf{classification} and
\textbf{regression} analysis (\cite{Wiki_SVM}).

Since a \textbf{classifier} is needed for the current task to classify
the Iris dataset, the following description of the operating principle
focuses on the \textbf{Support Vector Classifier (SVC)}.

    \hypertarget{support-vectors-and-hyperplane}{%
\subsubsection{Support Vectors and
hyperplane}\label{support-vectors-and-hyperplane}}

The SVC algorithm plots the training data in an \textbf{n-dimensional
space}. The number of dimensions results from the \textbf{number of
variables} or features. For the Iris dataset with its 4 features, this
would result in a 4-dimensional space. For better understanding, the
following explanation is limited to the \textbf{2-dimensional space}.

The SVC algorithm now tries to draw a \textbf{boundary with the largest
possible distance} to the nearest sample from the training data. This
boundary is actually a \textbf{hyperplane} whose \textbf{dimension is 1
smaller} than that of the training data (\cite{SVM_Python_2019}). For
example, in 3-dimensional space, the hyperplane would be a 2-dimensional
non-curved plane. In 2-dimensional space, a hyperplane would simply be a
straight line.

The following figure shows the principle of operation of the SVC
algorithm in 2-dimensional space with 2 classes to separate: the
hyperplanes \emph{H1} to \emph{H4} (left figure) separate the classes. A
good separation of the classes is achieved by the hyperplane that has
the \textbf{largest distance to the nearest training data point} of a
class (so-called \textbf{functional margin}). The larger the margin, the
better the classifier can later separate test data which are unknown to
it. This is called minimization of the \textbf{generalization error}.

The right graph shows the \textbf{optimal hyperplane} characterized by
\textbf{maximizing the margin} between classes
(\cite{Parameter_tuning_SVC}). The perpendicular distance of the data
points closest to the hyperplane determines its position and
orientation. These perpendicular distances are the \textbf{support
vectors} of the hyperplane - hence the algorithm got its name.

Interestingly, the vectors that are more distant from the boundary are
not important for the calculation. Therefore, they do not need to be
loaded into the main memory, which makes the SVC very memory efficient.

    \begin{figure}
\centering
\includegraphics{images/SVC_operatingPrinciple.png}
\caption{Support Vector Classifiers (SVC) separate the data points in
classes by finding the best hyperplane by maximizing the margin to its
support vectors (source: Kasper, license: CC BY-SA 4.0)}
\end{figure}

    \hypertarget{non-linear-transformations}{%
\subsubsection{Non-linear
transformations}\label{non-linear-transformations}}

The previous example assumes that the \textbf{data is linearly
separable}. For most real cases in practice this is unfortunately not
true and the SVC can only work with hyper\textbf{planes}.

If the data are not linearly separable in the original space,
\textbf{transformations} can be applied to the data. For this purpose,
the data are transferred into a \textbf{higher-dimensional feature
space} where the objects are linearly separable by a hyperplane.
However, during \textbf{backtransformation} this hyperplane becomes
\textbf{nonlinear} and often also non-continuous
(\cite{SVM_Python_2019}).

In the following example, the figure on the left shows the original data
points in 1-dimensional space. In the first dimension, these data are
not linearly separable. After applying the transformation
\(\Phi(X) = X^2\) and adding this second dimension to our feature space,
the classes in the right figure become linearly separable
(\cite{Kernel_trick_2018}).

    \begin{figure}
\centering
\includegraphics{images/SVC_transformation.png}
\caption{Transformation of 1-dimensional data into 2-dimensional space
in order to separate the data by a linear hyperplane (source: Kasper,
license: CC BY-SA 4.0)}
\end{figure}

    \hypertarget{kernel-trick}{%
\subsubsection{Kernel trick}\label{kernel-trick}}

In the previous example, it was shown how a transformation to a higher
dimensional feature space allows the data to be separated. In order to
train an SVC and make classification predictions, mathematical
operations would need to be performed on the higher dimensional vectors
in the transformed feature space. However, in real-world applications,
there may be many features in the data. The application of
transformations involves many polynomial combinations of these features.
This leads to \textbf{extremely high computational costs}
(\cite{Kernel_trick_2018}).

The \textbf{kernel trick} provides a solution to this problem. The
``trick'' is that \textbf{kernel methods} represent the data only by a
series of \textbf{pairwise similarity comparisons} between the original
data observations \(X\) (with the original coordinates in
low-dimensional space). This makes it possible to work in the original feature
space instead of explicitly applying the transformations \(\Phi(X)\) and
representing the data by these transformed coordinates in the higher
dimensional feature space (\cite{Kernel_trick_2018};
\cite{Kernel_trick_2018_2}).

The following \textbf{kernel types} are provided by the Python package
\texttt{scikit-learn}:

\begin{itemize}
\tightlist
\item
  linear
\item
  radial basis function (RBF)
\item
  polynomial
\item
  sigmoid
\end{itemize}

The most well-known are the
\href{https://en.wikipedia.org/wiki/Polynomial_kernel}{polynomial
kernel} and the
\href{https://en.wikipedia.org/wiki/Radial_basis_function_kernel}{radial
basis function (RBF) kernel}, which is also called the Gaussian kernel.

    \hypertarget{create-the-svc-model}{%
\subsection{Create the SVC model}\label{create-the-svc-model}}

In this step we create the SVC model choosing a \textbf{linear kernel}
with default parameters.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{64}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{svm} \PY{k+kn}{import} \PY{n}{SVC}
\PY{n}{classifier} \PY{o}{=} \PY{n}{SVC}\PY{p}{(}\PY{n}{kernel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{linear}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{random\PYZus{}state} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{step-4-preprocess-the-dataset-for-training}{%
\section{STEP 4: Preprocess the dataset for
training}\label{step-4-preprocess-the-dataset-for-training}}

In this step the dataset is prepared for the actual classification by
SVC. Depending on the selected ML algorithm as well as the data
structure, it may be necessary to prepare the data before training
(e.g., by \textbf{standardization}, \textbf{normalization}, or
\textbf{discretization} to cluster the data based on thresholds).
Furthermore, errors in the dataset (e.g.~\textbf{data gaps},
\textbf{duplicates} or obvious \textbf{misentries}) should be corrected
now at the latest.

\hypertarget{heal-the-dataset}{%
\subsection{Heal the dataset}\label{heal-the-dataset}}

Through the intensive exploration of the data (see
\hyperref[step-2-explore-the-ml-dataset]{STEP 2: Explore the ML dataset}),
we know that special \textbf{preparation} of the data is \textbf{not
necessary}. The values of the Iris dataset are \textbf{complete and
without gaps}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{65}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import ORIGINAL Iris dataset for classification}
\PY{n}{irisdata\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Import NOISED Iris dataset for classification}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df = pd.read\PYZus{}csv(\PYZsq{}./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}noised.csv\PYZsq{})}
\end{Verbatim}
\end{tcolorbox}

    Find \textbf{all completely identical duplicates} (first and last
occurrences). The resulting dataframe is \textbf{sorted by column}
\texttt{\textquotesingle{}species\textquotesingle{}} to get the
\textbf{duplicates grouped}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{66}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Parameter \PYZsq{}keep=False\PYZsq{} displays all duplicate rows}
\PY{n}{irisdata\PYZus{}duplicateRows} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{duplicated}\PY{p}{(}\PY{n}{keep}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Sort rows by column \PYZsq{}species\PYZsq{} to get the duplicates grouped}
\PY{n}{irisdata\PYZus{}duplicateRows}\PY{o}{.}\PY{n}{sort\PYZus{}values}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Listing of all completely identical duplicates found in }\PY{l+s+se}{\PYZbs{}}
\PY{l+s+s1}{               the Iris data set}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}duplicateRows}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.0602}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1807}}
  >{\raggedright\arraybackslash}p{(\columnwidth - 10\tabcolsep) * \real{0.1928}}@{}}
\caption{Listing of all completely identical duplicates found in the
Iris data set}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedleft
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedright
species
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
9 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
34 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
37 & 4.9 & 3.1 & 1.5 & 0.1 & Iris-setosa \\
101 & 5.8 & 2.7 & 5.1 & 1.9 & Iris-virginica \\
142 & 5.8 & 2.7 & 5.1 & 1.9 & Iris-virginica \\
\end{longtable}

    
    Interestingly, there are indeed \textbf{duplicates} in the original
\textbf{Iris dataset}.

The duplicates occur \textbf{imbalanced} across the classes:

\begin{itemize}
\tightlist
\item
  class \textbf{Iris-setosa} has \textbf{3} identical duplicates
\item
  class \textbf{Iris-virginica} has \textbf{2} identical duplicates
\item
  class \textbf{Iris-versicolor} has \textbf{no} duplicates
\end{itemize}

This \textbf{imbalance} could cause \textbf{tendencies} and have a
(negative) effect on the \textbf{classification result}. Therefore, the
duplicates are removed and the \textbf{cleaned Iris dataset} is
\textbf{saved} as a new CSV file.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{67}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Remove duplicate rows across all columns}
\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop\PYZus{}duplicates}\PY{p}{(}\PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{68}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Count unique values without missing values in a column,}
\PY{c+c1}{\PYZsh{} ordered ascending and not normalized}
\PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{value\PYZus{}counts}\PY{p}{(}\PY{n}{ascending}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{dropna}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} \PY{n}{normalize}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{68}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
Iris-setosa        48
Iris-virginica     49
Iris-versicolor    50
Name: species, dtype: int64
\end{Verbatim}
\end{tcolorbox}
        
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{69}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{csv\PYZus{}filepath} \PY{o}{=} \PY{l+s+sa}{r}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}cleaned.csv}\PY{l+s+s1}{\PYZsq{}}

\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{to\PYZus{}csv}\PY{p}{(}\PY{n}{csv\PYZus{}filepath}\PY{p}{,} \PY{n}{sep} \PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{,}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{index} \PY{o}{=} \PY{k+kc}{False}\PY{p}{,} \PY{n}{header}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{transform-the-dataset-by-feature-scaling}{%
\subsection{Transform the dataset by feature
scaling}\label{transform-the-dataset-by-feature-scaling}}

Some machine learning algorithms are very sensitive to feature scaling,
while others are virtually unaffected.

\textbf{Distance-based algorithms} such as the \textbf{Support Vector
Classifier (SVC)} explained in
\hyperref[step-3-choose-and-create-the-ml-model]{STEP 3}, or others such
as
\href{https://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm}{K-Nearest
Neighbors (KNN)} and
\href{https://en.wikipedia.org/wiki/K-means_clustering}{K-means
clustering} are most affected by the \textbf{bandwidth of features}.
This is because \textbf{distances between data points} are used in the
algorithm to determine their \textbf{similarity} (see
\cite{feature_scaling_2020}).

If \textbf{features} in the dataset have very \textbf{different ranges
(scales)}, the features with the larger scale could be \textbf{weighted}
more prominently by the ML algorithm. This \textbf{tendency} would lead
to \textbf{bias} in training, which negatively affects
\textbf{generalizability} in classifying test data. Therefore, the
\textbf{features} in the dataset should be \textbf{scaled} before
distance-based ML algorithms are used. Only by \textbf{adjusting the
data ranges} is it possible to ensure that \textbf{all features
contribute equally} to the classification result.

The following two subsections explain the two main methods for scaling:
\textbf{Normalization} and \textbf{Standardization}. The question often
arises as to when one or the other should be used, so here are a few
hints:

\begin{itemize}
\tightlist
\item
  \textbf{Normalization} is useful when the distribution of the data
  \textbf{does not} follow a \textbf{Gaussian normal distribution}. This
  can be helpful for algorithms that do not assume normally distributed
  data, such as K-Nearest Neighbors and neural networks. However,
  \textbf{outliers} in the data have a \textbf{major influence} on the
  \textbf{minimum} and \textbf{maximum} values used for the
  calculation. Therefore, normalization is significantly \textbf{more
  vulnerable to outliers} than standardization.
\item
  The \textbf{standardization}, on the other hand, can be helpful in
  cases where the data follow a \textbf{Gaussian normal distribution}.
  However, this does not necessarily have to be true. Also, unlike
  normalization, standardization does not have a limited range of
  values. However, standardization is significantly \textbf{less prone
  to outliers} in the data than normalization.
\end{itemize}

Finally, the \textbf{decision for normalization or standardization}
depends on the concrete \textbf{task}, the \textbf{data} and the
\textbf{ML algorithm} used. It is recommended to train the ML model
first with the raw data and then with the normalized or standardized
data. A subsequent comparison of the classification results (e.g., using
the
\href{https://en.wikipedia.org/wiki/Cross-validation_(statistics)}{cross-validation}
or the
\href{https://en.wikipedia.org/wiki/Root-mean-square_deviation}{root-mean-square
error (RMSE)}) provides guidance on which scaling should be used (see
\cite{feature_scaling_2020}).

At this point, here is an \textbf{important note} that is repeatedly mentioned
in the literature when talking about scaling (see
\cite{feature_scaling_2020}; \cite{Geron_2018}):

\begin{quote}
It is a good practice to \textbf{fit the scaler} on the \textbf{training
data} and then use it to \textbf{transform the testing data}. This would
avoid any data leakage during the model testing process. Also, the
scaling of target values is generally not required.
\end{quote}

For further details about \textbf{Standardization} and
\textbf{Normalization} read here:

scikit-learn:

\begin{itemize}
\tightlist
\item
  \href{https://scikit-learn.org/stable/modules/preprocessing.html}{Preprocessing
  data}
\item
  \href{https://scikit-learn.org/stable/auto_examples/preprocessing/plot_all_scaling.html\#sphx-glr-auto-examples-preprocessing-plot-all-scaling-py}{Compare
  the effect of different scalers on data with outliers}
\end{itemize}

Others:

\begin{itemize}
\tightlist
\item
  \href{http://techflare.blog/what-are-standarization-and-normalization-test-with-iris-data-set-in-scikit-learn/}{What
  are standarization and normalization? Test with iris data set in
  Scikit-learn}
\item
  \href{https://www.analyticsvidhya.com/blog/2020/04/feature-scaling-machine-learning-normalization-standardization/?}{Feature
  Scaling for Machine Learning: Understanding the Difference Between
  Normalization vs.~Standardization}
\item
  \href{https://en.wikipedia.org/wiki/Feature_scaling}{Feature scaling}
\item
  \href{https://en.wikipedia.org/wiki/Normalization_(statistics)}{Normalization
  (statistics)}
\item
  \href{https://en.wikipedia.org/wiki/Standard_score}{Standard score}
\end{itemize}

    \hypertarget{normalization}{%
\subsubsection{Normalization}\label{normalization}}

When scaling by \textbf{normalization}, the values are shifted and
rescaled so that they range \textbf{between 0 and 1}. It is also known
in the literature as \textbf{min-max scaling} (see
\cite{feature_scaling_2020}).

The \textbf{normalization} is calculated following this
\textbf{formula}:

\[X' = \frac{X - X_{min}}{X_{max} - X_{min}}\]

Thereby \(X_{max}\) and \(X_{min}\) are the maximum and the minimum
values of the feature, respectively.

\begin{itemize}
\tightlist
\item
  If the value of \(X\) is the \textbf{minimum value} of the feature,
  the numerator in the fraction becomes 0. Thus, \(X' = 0\).
\item
  If the value of \(X\) is the \textbf{maximum value} of the feature,
  the numerator becomes equal to the denominator of the fraction. Then
  the value of \(X' = 1\).
\item
  If the value of \(X\) is between the minimum and maximum values, the
  value of \(X'\) is between 0 and 1.
\end{itemize}

For further details read here:
\href{https://scikit-learn.org/stable/modules/preprocessing.html\#normalization}{scikit-learn:
Normalization}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{70}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{preprocessing} \PY{k+kn}{import} \PY{n}{MinMaxScaler}

\PY{c+c1}{\PYZsh{} Fit the MinMax scaler on raw input dataframe}
\PY{c+c1}{\PYZsh{} by selecting columns 1\PYZhy{}4 with all feature rows}
\PY{c+c1}{\PYZsh{}norm\PYZus{}scaler = MinMaxScaler().fit(irisdata\PYZus{}df.iloc[:, 0:4])}
\PY{c+c1}{\PYZsh{} by ommitting the last column with class names}
\PY{n}{norm\PYZus{}scaler} \PY{o}{=} \PY{n}{MinMaxScaler}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Transform the raw dateframe with fitted scaler}
\PY{c+c1}{\PYZsh{} and convert it to numpy array}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}np\PYZus{}norm = norm\PYZus{}scaler.transform(irisdata\PYZus{}df.iloc[:, 0:4])}
\PY{n}{irisdata\PYZus{}np\PYZus{}norm} \PY{o}{=} \PY{n}{norm\PYZus{}scaler}\PY{o}{.}\PY{n}{transform}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{}irisdata\PYZus{}np\PYZus{}norm}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{71}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Make a deep copy of original dataframe}
\PY{n}{irisdata\PYZus{}df\PYZus{}norm} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Replace values of dataframe with normalized values from array}
\PY{c+c1}{\PYZsh{} by writing in the first 4 columns (ommit last column with class names)}
\PY{n}{irisdata\PYZus{}df\PYZus{}norm}\PY{o}{.}\PY{n}{iloc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{:}\PY{l+m+mi}{4}\PY{p}{]} \PY{o}{=} \PY{n}{irisdata\PYZus{}np\PYZus{}norm}

\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df\PYZus{}norm}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{72}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get some basic statistical data of the normalized Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}norm}\PY{o}{.}\PY{n}{describe}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1014}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}@{}}
\caption{Get some basic statistical data of the normalized Iris
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
count & 147 & 147 & 147 & 147 \\
mean & 0.432351 & 0.439909 & 0.471233 & 0.462018 \\
std & 0.230306 & 0.182087 & 0.298154 & 0.315781 \\
min & 0 & 0 & 0 & 0 \\
25\% & 0.222222 & 0.333333 & 0.101695 & 0.0833333 \\
50\% & 0.416667 & 0.416667 & 0.576271 & 0.5 \\
75\% & 0.583333 & 0.541667 & 0.694915 & 0.708333 \\
max & 1 & 1 & 1 & 1 \\
\end{longtable}

    
    To display the \textbf{original data} with the \textbf{scaled data}
side-by-side as \textbf{boxplots} with all \textbf{features in one
scale}, the function \texttt{func\_boxplots\_comp\_scaling()} is
implemented.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{73}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k}{def} \PY{n+nf}{func\PYZus{}boxplots\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{dataframes}\PY{p}{,} \PY{n}{titles}\PY{p}{)}\PY{p}{:}
    \PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set margins between subplots}
    \PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
    \PY{k}{for} \PY{n}{title}\PY{p}{,} \PY{n}{df}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{titles}\PY{p}{,} \PY{n}{dataframes}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}

        \PY{c+c1}{\PYZsh{} To create multiple boxplots in seaborn,}
        \PY{c+c1}{\PYZsh{} we must first melt the pandas.DataFrame into a long format:}
        \PY{n}{df\PYZus{}melted} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{melt}\PY{p}{(}\PY{n}{df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Rename column to \PYZsq{}features\PYZsq{}}
        \PY{n}{df\PYZus{}melted}\PY{o}{.}\PY{n}{rename}\PY{p}{(}\PY{n}{columns}\PY{o}{=}\PY{p}{\PYZob{}}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{variable}\PY{l+s+s1}{\PYZsq{}}\PY{p}{:} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{features}\PY{l+s+s1}{\PYZsq{}}\PY{p}{\PYZcb{}}\PY{p}{,} \PY{n}{inplace}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

        \PY{n}{sns}\PY{o}{.}\PY{n}{boxplot}\PY{p}{(}\PY{n}{x}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{features}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{data}\PY{o}{=}\PY{n}{df\PYZus{}melted}\PY{p}{,} \PY{n}{ax} \PY{o}{=} \PY{n}{subplot}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Show grid}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{axis}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{y}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Set the title of the boxplot}
        \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the boxplot}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{ data}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{title}\PY{p}{)}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
        \PY{n}{subplot}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Call the new function \texttt{func\_boxplots\_comp\_scaling()} to create
the \textbf{boxplots} comparing \textbf{original data} with the
\textbf{scaled data} side-by-side:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{74}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{titles}     \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Original}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Normalized}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{dataframes} \PY{o}{=} \PY{p}{[}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{irisdata\PYZus{}df\PYZus{}norm}\PY{p}{]}

\PY{n}{func\PYZus{}boxplots\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{dataframes}\PY{p}{,} \PY{n}{titles}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_196_0.png}\end{center}
        \caption{Boxplots comparing the original data (left) with the normalized data (right) with all features in one scale}
        \label{fig:boxplots_comp_orig_norm}
    \end{figure}
    
    To compare the \textbf{original data} with the \textbf{scaled data}
side-by-side as \textbf{histograms} with overlaid \textbf{probability
density functions}, the function
\texttt{func\_histograms\_comp\_scaling()} is implemented.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{75}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{scipy}\PY{n+nn}{.}\PY{n+nn}{stats} \PY{k+kn}{import} \PY{n}{norm}

\PY{k}{def} \PY{n+nf}{func\PYZus{}histograms\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{df\PYZus{}orig}\PY{p}{,} \PY{n}{df\PYZus{}scaled}\PY{p}{,} \PY{n}{features}\PY{p}{,}
                                 \PY{n}{titles}\PY{p}{,} \PY{n}{scaling\PYZus{}type}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} Number of bins for the histogram}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}integer\PYZgt{}: defines the number of equal\PYZhy{}width bins in the range}
    \PY{c+c1}{\PYZsh{} \PYZhy{} bins=\PYZlt{}string\PYZgt{}: one of the binning strategies is used:}
    \PY{c+c1}{\PYZsh{}   \PYZsq{}auto\PYZsq{}, \PYZsq{}fd\PYZsq{}, \PYZsq{}doane\PYZsq{}, \PYZsq{}scott\PYZsq{}, \PYZsq{}stone\PYZsq{}, \PYZsq{}rice\PYZsq{}, \PYZsq{}sturges\PYZsq{}, or \PYZsq{}sqrt\PYZsq{}}
    \PY{n}{n\PYZus{}bins} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{auto}\PY{l+s+s1}{\PYZsq{}}
    \PY{c+c1}{\PYZsh{}n\PYZus{}bins = 10}
    \PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{4}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{18}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set margins between subplots}
    \PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.4}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
    \PY{n}{subplot\PYZus{}list} \PY{o}{=} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}

    \PY{n}{subplots\PYZus{}orig}   \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{0}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{4}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{]}
    \PY{n}{subplots\PYZus{}scaled} \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{7}\PY{p}{]}

    \PY{c+c1}{\PYZsh{} Show histograms with ORIGINAL data,}
    \PY{c+c1}{\PYZsh{} so loop through list of subplots with EVEN indexes}
    \PY{k}{for} \PY{n}{feature}\PY{p}{,} \PY{n}{title}\PY{p}{,} \PY{n}{index\PYZus{}subplt} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{n}{subplots\PYZus{}orig}\PY{p}{)}\PY{p}{:}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{df\PYZus{}orig}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{,}
                                        \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,}
                                        \PY{n}{density}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Fit a normal distribution to the data}
        \PY{c+c1}{\PYZsh{} with mean and standard deviation}
        \PY{n}{mu}\PY{p}{,} \PY{n}{std} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{df\PYZus{}orig}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Plot the probability density function (PDF)}
        \PY{n}{xmin}\PY{p}{,} \PY{n}{xmax} \PY{o}{=} \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{get\PYZus{}xlim}\PY{p}{(}\PY{p}{)}
        \PY{n}{x} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linspace}\PY{p}{(}\PY{n}{xmin}\PY{p}{,} \PY{n}{xmax}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{)}
        \PY{n}{p} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{pdf}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{plot}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{p}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{k}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{linewidth}\PY{o}{=}\PY{l+m+mi}{2}\PY{p}{)}

        \PY{n}{title\PYZus{}concat} \PY{o}{=} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{ (Mean: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{, }\PY{l+s+s2}{\PYZdq{}} \PYZbs{}
                       \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Std. deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{)}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Set the title of the histogram}
        \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the histogram}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{n}{title\PYZus{}concat}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Show grid}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{original value range [cm]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (relative)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Show histograms with SCALED data,}
    \PY{c+c1}{\PYZsh{} so loop through list of subplots with ODD indexes}
    \PY{k}{for} \PY{n}{feature}\PY{p}{,} \PY{n}{title}\PY{p}{,} \PY{n}{index\PYZus{}subplt} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{n}{subplots\PYZus{}scaled}\PY{p}{)}\PY{p}{:}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{hist}\PY{p}{(}\PY{n}{df\PYZus{}scaled}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{,}
                                        \PY{n}{bins} \PY{o}{=} \PY{n}{n\PYZus{}bins}\PY{p}{,} \PY{n}{rwidth}\PY{o}{=}\PY{l+m+mf}{0.95}\PY{p}{,}
                                        \PY{n}{density}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.8}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Fit a normal distribution to the data}
        \PY{c+c1}{\PYZsh{} with mean and standard deviation}
        \PY{n}{mu}\PY{p}{,} \PY{n}{std} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{df\PYZus{}scaled}\PY{p}{[}\PY{n}{feature}\PY{p}{]}\PY{p}{)}

        \PY{c+c1}{\PYZsh{} Plot the probability density function (PDF)}
        \PY{n}{xmin}\PY{p}{,} \PY{n}{xmax} \PY{o}{=} \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{get\PYZus{}xlim}\PY{p}{(}\PY{p}{)}
        \PY{n}{x} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{linspace}\PY{p}{(}\PY{n}{xmin}\PY{p}{,} \PY{n}{xmax}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{)}
        \PY{n}{p} \PY{o}{=} \PY{n}{norm}\PY{o}{.}\PY{n}{pdf}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{plot}\PY{p}{(}\PY{n}{x}\PY{p}{,} \PY{n}{p}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{k}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{linewidth}\PY{o}{=}\PY{l+m+mi}{2}\PY{p}{)}

        \PY{n}{title\PYZus{}concat} \PY{o}{=} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{ (Mean: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{, }\PY{l+s+s2}{\PYZdq{}} \PYZbs{}
                       \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Std. deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{)}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{mu}\PY{p}{,} \PY{n}{std}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Set the title of the histogram}
        \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the histogram}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{n}{title\PYZus{}concat}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Show grid}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Hide grid behind the bars}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}axisbelow}\PY{p}{(}\PY{k+kc}{True}\PY{p}{)}
        \PY{c+c1}{\PYZsh{} Label x and y\PYZhy{}axis}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{ value range [cm]}\PY{l+s+s1}{\PYZsq{}}
                                              \PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{scaling\PYZus{}type}\PY{p}{)}\PY{p}{)}
        \PY{n}{subplot\PYZus{}list}\PY{p}{[}\PY{n}{index\PYZus{}subplt}\PY{p}{]}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{frequency density (relative)}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Call the new function \texttt{func\_histograms\_comp\_scaling()} to
create the \textbf{histograms} with overlaid \textbf{probability density
functions} comparing \textbf{original data} with the \textbf{normalized
data} side-by-side:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{76}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{features} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{titles} \PY{o}{=}   \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{func\PYZus{}histograms\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{irisdata\PYZus{}df\PYZus{}norm}\PY{p}{,}
                             \PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{normalized}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_200_0.png}\end{center}
        \caption{Histograms with overlaid probability density functions comparing original data (left) with the normalized data (right)}
        \label{fig:histograms_comp_orig_norm}
    \end{figure}
    
    \hypertarget{standardization}{%
\subsubsection{Standardization}\label{standardization}}

\textbf{Standardization} is another scaling technique in which
\textbf{values are centered around the mean} with a \textbf{unit
standard deviation}. That is, the \textbf{mean} of the feature
\textbf{becomes zero} and the resulting \textbf{distribution} has a
\textbf{unit standard deviation} (see~\cite{feature_scaling_2020}).

The \textbf{standardization} is calculated following this
\textbf{formula}:

\[X' = \frac{X - \mu}{\sigma}\]

\begin{itemize}
\tightlist
\item
  \(\mu\) is the \textbf{mean} of the feature values and
\item
  \(\sigma\) is the \textbf{standard deviation} of the feature values.
\end{itemize}

It should be noted that the values after standardization are not limited
to a certain range (unlike normalization).

For further details read here:
\href{https://scikit-learn.org/stable/modules/preprocessing.html\#standardization-or-mean-removal-and-variance-scaling}{scikit-learn:
Standardization, or mean removal and variance scaling}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{77}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{preprocessing} \PY{k+kn}{import} \PY{n}{StandardScaler}

\PY{c+c1}{\PYZsh{} Fit the standard scaler on raw input dataframe}
\PY{c+c1}{\PYZsh{} by selecting columns 1\PYZhy{}4 with all feature rows}
\PY{c+c1}{\PYZsh{}std\PYZus{}scaler = StandardScaler().fit(irisdata\PYZus{}df.iloc[:, 0:4])}
\PY{c+c1}{\PYZsh{} by omitting the last column with class names}
\PY{n}{std\PYZus{}scaler} \PY{o}{=} \PY{n}{StandardScaler}\PY{p}{(}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Transform the raw dataframe with fitted scaler}
\PY{c+c1}{\PYZsh{} and convert it to numpy array}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}np\PYZus{}std = std\PYZus{}scaler.transform(irisdata\PYZus{}df.iloc[:, 0:4])}
\PY{n}{irisdata\PYZus{}np\PYZus{}std} \PY{o}{=} \PY{n}{std\PYZus{}scaler}\PY{o}{.}\PY{n}{transform}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}\PY{p}{)}

\PY{c+c1}{\PYZsh{}irisdata\PYZus{}np\PYZus{}std}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{78}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Make a deep copy of original dataframe}
\PY{n}{irisdata\PYZus{}df\PYZus{}std} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Replace values of dataframe with standardized values from array}
\PY{n}{irisdata\PYZus{}df\PYZus{}std}\PY{o}{.}\PY{n}{iloc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{:}\PY{l+m+mi}{4}\PY{p}{]} \PY{o}{=} \PY{n}{irisdata\PYZus{}np\PYZus{}std}

\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df\PYZus{}std}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{79}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get some basic statistical data of the standardized Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}std}\PY{o}{.}\PY{n}{describe}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1014}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}@{}}
\caption{Get some basic statistical data of the standardized Iris
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
count & 147 & 147 & 147 & 147 \\
mean & -8.97808e-17 & 1.18952e-17 & -3.17962e-16 & -3.57235e-16 \\
std & 1.00342 & 1.00342 & 1.00342 & 1.00342 \\
min & -1.88371 & -2.42419 & -1.5859 & -1.4681 \\
25\% & -0.915509 & -0.587304 & -1.24365 & -1.2033 \\
50\% & -0.0683339 & -0.128082 & 0.353501 & 0.12069 \\
75\% & 0.657817 & 0.56075 & 0.752789 & 0.782686 \\
max & 2.47319 & 3.08647 & 1.77953 & 1.70948 \\
\end{longtable}

    
    As in the previous section, the \textbf{original} and the
\textbf{standardized data} are plotted as side-by-side \textbf{boxplots}
with all \textbf{features at one scale}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{80}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{titles}     \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Original}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Standardized}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{dataframes} \PY{o}{=} \PY{p}{[}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{irisdata\PYZus{}df\PYZus{}std}\PY{p}{]}

\PY{n}{func\PYZus{}boxplots\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{dataframes}\PY{p}{,} \PY{n}{titles}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_206_0.png}\end{center}
        \caption{Boxplots comparing the original data (left) with the standardized data (right) with all features in one scale}
        \label{fig:boxplots_comp_orig_std}
    \end{figure}
    
    As in the previous section, the \textbf{original} and the
\textbf{standardized data} are plotted as side-by-side
\textbf{histograms} with overlaid \textbf{probability density
functions}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{81}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{features} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sepal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}
\PY{n}{titles} \PY{o}{=}   \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Sepal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal Width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{func\PYZus{}histograms\PYZus{}comp\PYZus{}scaling}\PY{p}{(}\PY{n}{irisdata\PYZus{}df}\PY{p}{,} \PY{n}{irisdata\PYZus{}df\PYZus{}std}\PY{p}{,}
                             \PY{n}{features}\PY{p}{,} \PY{n}{titles}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{standardized}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_208_0.png}\end{center}
        \caption{Histograms with overlaid probability density functions comparing original data (left) with the standardized data (right)}
        \label{fig:histograms_comp_orig_std}
    \end{figure}
    
    \hypertarget{step-5-carry-out-training-prediction-and-testing}{%
\section{STEP 5: Carry out training, prediction and
testing}\label{step-5-carry-out-training-prediction-and-testing}}

To avoid errors, the Iris dataset is imported again:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{82}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import ORIGINAL Iris dataset for classification}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df = pd.read\PYZus{}csv(\PYZsq{}./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv\PYZsq{})}

\PY{c+c1}{\PYZsh{} Import CLEANED Iris dataset for classification (removed duplicates)}
\PY{n}{irisdata\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}cleaned.csv}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Import NOISED Iris dataset for classification}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df = pd.read\PYZus{}csv(\PYZsq{}./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}noised.csv\PYZsq{})}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{split-the-dataset}{%
\subsection{Split the dataset}\label{split-the-dataset}}

In the next very important step, the dataset is split into \textbf{2
subsets}: a \textbf{training dataset} and a \textbf{test dataset}. As
the names suggest, the training dataset is used to train the ML
algorithm. The test dataset is then used to check the quality of the
trained ML algorithm (here the \textbf{recognition rate}). For this
purpose, the \textbf{class labels} are \textbf{separated} from the
feature values of both subsets---after all, the labels of the test
dataset are to be predicted and then compared with the true labels.

Typically, the \textbf{test dataset} should contain about \textbf{20\%}
of the entire dataset.

Because the Iris dataset is sorted, splitting it as-is would introduce
\textbf{bias}. To \textbf{avoid} this, the \textbf{order} of the data
rows must be \textbf{randomized} before splitting. This is done with the
parameter \texttt{shuffle=True}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{83}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{train\PYZus{}test\PYZus{}split}

\PY{n}{X} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}
\PY{n}{y} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{X\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}test} \PY{o}{=} \PY{n}{train\PYZus{}test\PYZus{}split}\PY{p}{(}\PY{n}{X}\PY{p}{,} \PY{n}{y}\PY{p}{,} 
                                                    \PY{n}{test\PYZus{}size} \PY{o}{=} \PY{l+m+mf}{0.20}\PY{p}{,} 
                                                    \PY{n}{shuffle}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Check that the split datasets are still balanced and that no
\textbf{bias} has been created by the splitting.

For this test, the previously separated labels \texttt{y\_train} must be
added back to the training dataset \texttt{X\_train}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{84}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Make a deep copy of \PYZsq{}X\PYZus{}train\PYZsq{}}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Add list of labels to test dataframe}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{=} \PY{n}{y\PYZus{}train}

\PY{c+c1}{\PYZsh{} Count unique values in a column (including missing values),}
\PY{c+c1}{\PYZsh{} ordered ascending and not normalized}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{value\PYZus{}counts}\PY{p}{(}\PY{n}{ascending}\PY{o}{=}\PY{k+kc}{True}\PY{p}{,} 
                                             \PY{n}{dropna}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} 
                                             \PY{n}{normalize}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{84}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
Iris-versicolor    37
Iris-virginica     38
Iris-setosa        42
Name: species, dtype: int64
\end{Verbatim}
\end{tcolorbox}
        
    For training, use all features rather than only those that correlate
best with each other.

Otherwise, the prediction results would be significantly worse.
Maybe this is already an indication of \textbf{overfitting} of the ML
model.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{85}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} DO NOT USE THIS!!}
\PY{c+c1}{\PYZsh{}X\PYZus{}train, X\PYZus{}test, y\PYZus{}train, y\PYZus{}test = train\PYZus{}test\PYZus{}split(X[[\PYZsq{}sepal\PYZus{}length\PYZsq{}, }
\PY{c+c1}{\PYZsh{}                                                       \PYZsq{}sepal\PYZus{}width\PYZsq{}]], }
\PY{c+c1}{\PYZsh{}                                                    y, test\PYZus{}size = 0.20)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{standardize-the-datasets}{%
\subsection{Standardize the datasets}\label{standardize-the-datasets}}

Standardize the feature values by computing the \textbf{mean},
subtracting the mean from the data points, and then dividing by the
\textbf{standard deviation}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{86}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{preprocessing} \PY{k+kn}{import} \PY{n}{StandardScaler}

\PY{n}{std\PYZus{}scaler} \PY{o}{=} \PY{n}{StandardScaler}\PY{p}{(}\PY{p}{)}
\PY{n}{X\PYZus{}train} \PY{o}{=} \PY{n}{std\PYZus{}scaler}\PY{o}{.}\PY{n}{fit\PYZus{}transform}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{)}
\PY{n}{X\PYZus{}test} \PY{o}{=} \PY{n}{std\PYZus{}scaler}\PY{o}{.}\PY{n}{transform}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}

\PY{c+c1}{\PYZsh{}X\PYZus{}train}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{train-the-svc}{%
\subsection{Train the SVC}\label{train-the-svc}}

In this step the SVC is trained with the training data. Training means
to \textbf{fit} the SVC to the \textbf{training data}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{87}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} fit the model for the data}
\PY{n}{classifier}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{87}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
SVC(kernel='linear', random\_state=0)
\end{Verbatim}
\end{tcolorbox}
        
    \hypertarget{make-predictions}{%
\subsection{Make predictions}\label{make-predictions}}

In this step the aim is to \textbf{predict the species} using unlabeled
test data.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{88}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{y\PYZus{}pred} \PY{o}{=} \PY{n}{classifier}\PY{o}{.}\PY{n}{predict}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}
\PY{c+c1}{\PYZsh{}X\PYZus{}test}
\PY{c+c1}{\PYZsh{}y\PYZus{}pred}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{step-6-evaluate-models-performance}{%
\section{STEP 6: Evaluate model's
performance}\label{step-6-evaluate-models-performance}}

Following the training of the SVC model and the classification
predictions made on the test data, this step evaluates the
\textbf{quality of the classification result} using well-known
\textbf{metrics} such as the \textbf{accuracy score}, the
\textbf{cross-validation score} and the \textbf{confusion
matrix}.

    \hypertarget{accuracy-score}{%
\subsection{Accuracy Score}\label{accuracy-score}}

In a multiclass classification (such as the Iris dataset), the
\textbf{accuracy classification score} is the fraction of samples that
are classified correctly. This means that the \textbf{label predicted}
in \texttt{y\_pred} for a sample must exactly match the corresponding
\textbf{true label} in \texttt{y\_test}. (Only in a multilabel
classification, where each sample carries a set of labels, does the
function compute the stricter \emph{subset accuracy}, in which the whole
predicted label set must match.) For further details see
\href{https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html\#sklearn.metrics.accuracy_score}{sklearn.metrics.accuracy\_score}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{89}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{accuracy\PYZus{}score}

\PY{n}{acc\PYZus{}score} \PY{o}{=} \PY{n}{accuracy\PYZus{}score}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Accuracy score: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{acc\PYZus{}score}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Accuracy score: 90.00 \%
    \end{Verbatim}

    \hypertarget{classification-report}{%
\subsection{Classification Report}\label{classification-report}}

The \textbf{classification report} shows a representation of the most
important \textbf{classification metrics} on a \textbf{class-by-class
basis}. This gives a \textbf{better understanding} of the behaviour of
the classifier than \textbf{global accuracy}, which can mask functional
weaknesses in one class of a multi-class problem (see
\href{https://www.scikit-yb.org/en/latest/api/classifier/classification_report.html}{Classification
Report}).

The metrics are defined in the form of \textbf{true/false positives} and
\textbf{true/false negatives} as follows:

\begin{itemize}
\item
  \textbf{precision:} is the metric for the \textbf{accuracy} of a
  classifier. For each class, it is defined as the \textbf{ratio} of
  \textbf{true positives} to the \textbf{sum of true and false
  positives}.
\item
  \textbf{recall:} is a metric for the \textbf{completeness} of the
  classifier, i.e.~the ability of a classifier to find \textbf{all
  positive instances correctly}. For each class, it is defined as the
  \textbf{ratio} of \textbf{true positives} to the \textbf{sum of true
  positives and false negatives}.
\item
  \textbf{f1-score:} is a \textbf{weighted harmonic mean of precision
  and recall}, with the best value being 1.0 and the worst 0.0.
\item
  \textbf{support:} is the \textbf{number of actual occurrences of the
  class} in the specified data set. This metric is an indication of the
  \textbf{balance of the class distribution} in the test dataset.
\end{itemize}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{90}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{classification\PYZus{}report}

\PY{n+nb}{print}\PY{p}{(}\PY{n}{classification\PYZus{}report}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         6
Iris-versicolor       0.92      0.85      0.88        13
 Iris-virginica       0.83      0.91      0.87        11

       accuracy                           0.90        30
      macro avg       0.92      0.92      0.92        30
   weighted avg       0.90      0.90      0.90        30

    \end{Verbatim}

    \hypertarget{cross-validation-score}{%
\subsection{Cross-validation score}\label{cross-validation-score}}

The previous evaluations by the \textbf{accuracy classification score}
and the \textbf{classification report} were carried out after a
\textbf{manual classification}. For this purpose, the complete Iris
dataset was first split into a larger \textbf{training dataset} and a
much smaller \textbf{test dataset} with the function
\texttt{train\_test\_split()}. Then the \textbf{SVC model} was
\textbf{trained} with the training dataset and \textbf{validated} with
the test dataset.

For \textbf{automatic classification} with subsequent
\textbf{validation}, Scikit-learn provides the function
\texttt{cross\_val\_score()}. This performs a \textbf{k-fold
cross-validation}. First, the Iris dataset is randomly split into
several different \textbf{subsets} (called \emph{folds}). The number of
subsets is determined by the parameter \texttt{cv} (here
\texttt{cv\ =\ 10}). Subsequently, the \textbf{SVC model} is
\textbf{trained} and \textbf{evaluated} several times in succession
according to the number of subsets (here 10 times). In each run, a
different subset (fold) is used for validation, while training is
performed on the remaining (here nine) folds. The \textbf{result} is an
\textbf{array with ten scores} from the \textbf{evaluation}
(see~\cite{Geron_2018}).

The \textbf{cross-validation} method \textbf{trains} and
\textbf{validates} a model over \textbf{multiple runs} according to the
number of subsets. This leads to a better \textbf{understanding of model
performance} over the \textbf{entire dataset} rather than just a single
train/test split (see~\cite{cross_val_score_2022}).

In the following code example, the cross validation is determined over
10 runs (\texttt{cv\ =\ 10}). From the 10 resulting scores in the array
a \textbf{mean} is formed and additionally the \textbf{standard
deviation} is given.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{91}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{cross\PYZus{}val\PYZus{}score}

\PY{n}{irisdata\PYZus{}df\PYZus{}wo\PYZus{}labels} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}
\PY{n}{irisdata\PYZus{}labels} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{accuracies} \PY{o}{=} \PY{n}{cross\PYZus{}val\PYZus{}score}\PY{p}{(}\PY{n}{estimator} \PY{o}{=} \PY{n}{classifier}\PY{p}{,} \PY{n}{X} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}wo\PYZus{}labels}\PY{p}{,} 
                             \PY{n}{y} \PY{o}{=} \PY{n}{irisdata\PYZus{}labels}\PY{p}{,} \PY{n}{cv} \PY{o}{=} \PY{l+m+mi}{10}\PY{p}{)}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Cross\PYZhy{}validation score: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Standard Deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{std}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Cross-validation score: 97.33 \%
Standard Deviation: 4.42 \%
    \end{Verbatim}

    \hypertarget{confusion-matrix}{%
\subsection{Confusion matrix}\label{confusion-matrix}}

The \textbf{confusion matrix} measures the \textbf{quality of
predictions} from a classification model by looking at how many
\textbf{predictions} are \textbf{True} and how many are \textbf{False}
(see
\href{https://www.jcchouinard.com/confusion-matrix-in-scikit-learn/}{What
the Confusion Matrix Measures?}).

The confusion matrix (also called error matrix) is a special table
layout that visualizes the performance of a classification algorithm.
Each \textbf{row} of the matrix represents the instances in a
\textbf{true class}, while each \textbf{column} represents the instances
in a \textbf{predicted class}, or vice versa---both variants can be
found in the literature. Using this matrix, it is easy to see if there
has been confusion between classes during classification---which is
where the name comes from.

\hypertarget{textual-confusion-matrix}{%
\subsubsection{Textual confusion
matrix}\label{textual-confusion-matrix}}

For checking the accuracy of the model, the \textbf{confusion matrix}
can be used for the \textbf{cross validation}.

By using the function \texttt{sklearn.metrics.confusion\_matrix()}, a
\textbf{confusion matrix} of the \textbf{true Iris class labels} versus
the \textbf{predicted class labels} is computed and printed.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{92}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{cm} \PY{o}{=} \PY{n}{metrics}\PY{o}{.}\PY{n}{confusion\PYZus{}matrix}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{n}{cm}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
[[ 6  0  0]
 [ 0 11  2]
 [ 0  1 10]]
    \end{Verbatim}

    \hypertarget{colored-confusion-matrix}{%
\subsubsection{Colored confusion
matrix}\label{colored-confusion-matrix}}

The class \texttt{sklearn.metrics.ConfusionMatrixDisplay} (used here via
its method \texttt{from\_predictions()}) plots a colored confusion matrix.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{93}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{white}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} print colored confusion matrix}
\PY{n}{cm\PYZus{}colored} \PY{o}{=} \PY{n}{metrics}\PY{o}{.}\PY{n}{ConfusionMatrixDisplay}\PY{o}{.}\PY{n}{from\PYZus{}predictions}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figwidth}\PY{p}{(}\PY{l+m+mi}{8}\PY{p}{)}
\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figheight}\PY{p}{(}\PY{l+m+mi}{7}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{confusion\PYZus{}matrix}

\PY{c+c1}{\PYZsh{} y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Colored Confusion Matrix}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{1.1}\PY{p}{)}

\PY{c+c1}{\PYZsh{} save figure as PNG}
\PY{c+c1}{\PYZsh{}plt.tight\PYZus{}layout()}
\PY{c+c1}{\PYZsh{}plt.savefig(\PYZsq{}images/confusion\PYZus{}matrix.png\PYZsq{}, dpi=150, pad\PYZus{}inches=5)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_233_0.png}\end{center}
        \caption{Checking the accuracy of the model by using the confusion matrix for cross-validation}
        \label{fig:confusion_matrix}
    \end{figure}
    
    \hypertarget{step-7-vary-parameters-of-the-ml-model-manually}{%
\section{STEP 7: Vary parameters of the ML model
manually}\label{step-7-vary-parameters-of-the-ml-model-manually}}

This section was inspired by
\href{https://medium.com/all-things-ai/in-depth-parameter-tuning-for-svc-758215394769}{In
Depth: Parameter tuning for SVC}.

In this section, the 4 SVC parameters \texttt{kernel}, \texttt{gamma},
\texttt{C} and \texttt{degree} will be introduced one by one.
Furthermore, their influence on the classification result by varying
these single parameters will be shown.

\textbf{Disclaimer:} In order to show the effects of varying the
individual parameters in 2D graphs, only the best correlating variables
\texttt{petal\_length} and \texttt{petal\_width} are used to train the
SVC.

\hypertarget{prepare-dataset}{%
\subsection{Prepare dataset}\label{prepare-dataset}}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{94}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import packages}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{svm} \PY{k+kn}{import} \PY{n}{SVC}
\PY{c+c1}{\PYZsh{}from sklearn.preprocessing import StandardScaler}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{preprocessing} \PY{k+kn}{import} \PY{n}{MinMaxScaler}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{train\PYZus{}test\PYZus{}split}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{cross\PYZus{}val\PYZus{}score}
\PY{k+kn}{import} \PY{n+nn}{numpy} \PY{k}{as} \PY{n+nn}{np}
\end{Verbatim}
\end{tcolorbox}

    To avoid errors, the Iris dataset is imported again:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{95}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import ORIGINAL Iris dataset for classification}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df = pd.read\PYZus{}csv(\PYZsq{}./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv\PYZsq{})}

\PY{c+c1}{\PYZsh{} Import NOISED Iris dataset for classification}
\PY{n}{irisdata\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}noised.csv}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Encode the class column from class strings to integer equivalents:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{96}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{irisdata\PYZus{}df\PYZus{}enc} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{replace}\PY{p}{(}\PY{p}{\PYZob{}}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{species}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}  \PY{p}{\PYZob{}}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}setosa}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{0}\PY{p}{,}
                                                    \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}versicolor}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{1}\PY{p}{,}
                                                    \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Iris\PYZhy{}virginica}\PY{l+s+s2}{\PYZdq{}}\PY{p}{:}\PY{l+m+mi}{2}\PY{p}{\PYZcb{}}\PY{p}{\PYZcb{}}\PY{p}{)}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}df\PYZus{}enc}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{prepare-datasets-for-parameter-variation-and-plotting}{%
\subsubsection{Prepare datasets for parameter variation and
plotting}\label{prepare-datasets-for-parameter-variation-and-plotting}}

These datasets will be used for parameter variation and plotting only.
In particular, for later \textbf{2D plotting} of the effects of
parameter variation, only \textbf{2 variables} of the Iris dataset can
be used.

However, as seen in the previous section, this selection is very much at
the expense of detection accuracy. Therefore, it is not useful to make
predictions with this subset of data, and hence it is not necessary to
divide it into a training and a test dataset.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{97}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Copy only 2 feature columns}
\PY{c+c1}{\PYZsh{} and convert pandas.DataFrame to numpy array}
\PY{n}{X\PYZus{}plot} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{p}{[}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}length}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{petal\PYZus{}width}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{]}\PY{o}{.}\PY{n}{to\PYZus{}numpy}\PY{p}{(}\PY{n}{copy}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{}X\PYZus{}plot = irisdata\PYZus{}df\PYZus{}enc[[\PYZsq{}sepal\PYZus{}length\PYZsq{}, \PYZsq{}sepal\PYZus{}width\PYZsq{}]].to\PYZus{}numpy(copy=True)}
\PY{c+c1}{\PYZsh{}X\PYZus{}plot}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{98}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Convert pandas.DataFrame to numpy array}
\PY{c+c1}{\PYZsh{} and get a flat 1D copy of 2D numpy array}
\PY{n}{y\PYZus{}plot} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}enc}\PY{p}{[}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{p}{]}\PY{o}{.}\PY{n}{to\PYZus{}numpy}\PY{p}{(}\PY{n}{copy}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}
\PY{c+c1}{\PYZsh{}y\PYZus{}plot}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{prepare-dataset-for-prediction-and-evaluation}{%
\subsubsection{Prepare dataset for prediction and
evaluation}\label{prepare-dataset-for-prediction-and-evaluation}}

To \textbf{evaluate the recognition accuracy} by parameter variation,
the complete Iris dataset with all variables must be used. To make
predictions with test data, the dataset is again divided into a training
and a test dataset.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{99}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{X} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}
\PY{n}{y} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{X\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}test} \PY{o}{=} \PY{n}{train\PYZus{}test\PYZus{}split}\PY{p}{(}\PY{n}{X}\PY{p}{,} \PY{n}{y}\PY{p}{,} 
                                                    \PY{n}{test\PYZus{}size} \PY{o}{=} \PY{l+m+mf}{0.20}\PY{p}{,} 
                                                    \PY{n}{shuffle}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\PY{c+c1}{\PYZsh{}X\PYZus{}train}
\end{Verbatim}
\end{tcolorbox}

    \textbf{Normalize} the feature values for \textbf{prediction and
evaluation}. Normalization is deliberately used here to avoid the
visualization problems caused by the negative values that
standardization produces.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{100}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{norm\PYZus{}scaler\PYZus{}pred} \PY{o}{=} \PY{n}{MinMaxScaler}\PY{p}{(}\PY{p}{)}
\PY{n}{X\PYZus{}train} \PY{o}{=} \PY{n}{norm\PYZus{}scaler\PYZus{}pred}\PY{o}{.}\PY{n}{fit\PYZus{}transform}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{)}
\PY{n}{X\PYZus{}test} \PY{o}{=} \PY{n}{norm\PYZus{}scaler\PYZus{}pred}\PY{o}{.}\PY{n}{transform}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{plotting-functions}{%
\subsection{Plotting functions}\label{plotting-functions}}

This function helps to \textbf{visualize} the modifications by
\textbf{varying} the individual \textbf{SVC parameters}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{101}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k}{def} \PY{n+nf}{plotSVC}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{svc}\PY{p}{,} \PY{n}{X}\PY{p}{,} \PY{n}{y}\PY{p}{,} \PY{n}{xlabel}\PY{p}{,} \PY{n}{ylabel}\PY{p}{,} \PY{n}{subplot}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} create a mesh to plot in}
    \PY{n}{x\PYZus{}min}\PY{p}{,} \PY{n}{x\PYZus{}max} \PY{o}{=} \PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{min}\PY{p}{(}\PY{p}{)} \PY{o}{\PYZhy{}} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{]}\PY{o}{.}\PY{n}{max}\PY{p}{(}\PY{p}{)} \PY{o}{+} \PY{l+m+mi}{1}
    \PY{n}{y\PYZus{}min}\PY{p}{,} \PY{n}{y\PYZus{}max} \PY{o}{=} \PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{min}\PY{p}{(}\PY{p}{)} \PY{o}{\PYZhy{}} \PY{l+m+mi}{1}\PY{p}{,} \PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{]}\PY{o}{.}\PY{n}{max}\PY{p}{(}\PY{p}{)} \PY{o}{+} \PY{l+m+mi}{1}
    
    \PY{c+c1}{\PYZsh{} prevent division by zero}
    \PY{k}{if} \PY{n}{x\PYZus{}min} \PY{o}{==} \PY{l+m+mf}{0.0}\PY{p}{:}
        \PY{n}{x\PYZus{}min} \PY{o}{=} \PY{l+m+mf}{0.1}
    
    \PY{n}{h} \PY{o}{=} \PY{p}{(}\PY{n}{x\PYZus{}max} \PY{o}{/} \PY{n}{x\PYZus{}min}\PY{p}{)}\PY{o}{/}\PY{l+m+mi}{1000}
    \PY{n}{xx}\PY{p}{,} \PY{n}{yy} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{meshgrid}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{arange}\PY{p}{(}\PY{n}{x\PYZus{}min}\PY{p}{,} \PY{n}{x\PYZus{}max}\PY{p}{,} \PY{n}{h}\PY{p}{)}\PY{p}{,} \PY{n}{np}\PY{o}{.}\PY{n}{arange}\PY{p}{(}\PY{n}{y\PYZus{}min}\PY{p}{,} \PY{n}{y\PYZus{}max}\PY{p}{,} \PY{n}{h}\PY{p}{)}\PY{p}{)}
    
    \PY{n}{Z} \PY{o}{=} \PY{n}{svc}\PY{o}{.}\PY{n}{predict}\PY{p}{(}\PY{n}{np}\PY{o}{.}\PY{n}{c\PYZus{}}\PY{p}{[}\PY{n}{xx}\PY{o}{.}\PY{n}{ravel}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{yy}\PY{o}{.}\PY{n}{ravel}\PY{p}{(}\PY{p}{)}\PY{p}{]}\PY{p}{)}
    \PY{n}{Z} \PY{o}{=} \PY{n}{Z}\PY{o}{.}\PY{n}{reshape}\PY{p}{(}\PY{n}{xx}\PY{o}{.}\PY{n}{shape}\PY{p}{)}
    
    \PY{n}{ax} \PY{o}{=} \PY{n}{subplot}
    
    \PY{n}{ax}\PY{o}{.}\PY{n}{contourf}\PY{p}{(}\PY{n}{xx}\PY{p}{,} \PY{n}{yy}\PY{p}{,} \PY{n}{Z}\PY{p}{,} \PY{n}{cmap}\PY{o}{=}\PY{n}{plt}\PY{o}{.}\PY{n}{cm}\PY{o}{.}\PY{n}{Paired}\PY{p}{,} \PY{n}{alpha}\PY{o}{=}\PY{l+m+mf}{0.6}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{scatter}\PY{p}{(}\PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{]}\PY{p}{,} \PY{n}{X}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{]}\PY{p}{,} \PY{n}{c}\PY{o}{=}\PY{n}{y}\PY{p}{,} \PY{n}{cmap}\PY{o}{=}\PY{n}{plt}\PY{o}{.}\PY{n}{cm}\PY{o}{.}\PY{n}{Paired}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{set\PYZus{}xlabel}\PY{p}{(}\PY{n}{xlabel}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{set\PYZus{}ylabel}\PY{p}{(}\PY{n}{ylabel}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{set\PYZus{}xlim}\PY{p}{(}\PY{n}{xx}\PY{o}{.}\PY{n}{min}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{xx}\PY{o}{.}\PY{n}{max}\PY{p}{(}\PY{p}{)}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} Set the title of the diagram}
    \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the diagram}
    \PY{n}{ax}\PY{o}{.}\PY{n}{set\PYZus{}title}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    This function performs the \textbf{cross-validation}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{102}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k}{def} \PY{n+nf}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{scale}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{l+m+mf}{1.0}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{l+m+mi}{3}\PY{p}{)}\PY{p}{:}
    \PY{c+c1}{\PYZsh{} train the SVC}
    \PY{n}{svc} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{n}{kernel}\PY{p}{,} 
                  \PY{n}{gamma}\PY{o}{=}\PY{n}{gamma}\PY{p}{,} 
                  \PY{n}{C}\PY{o}{=}\PY{n}{C}\PY{p}{,} 
                  \PY{n}{degree}\PY{o}{=}\PY{n}{degree}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} calculate accuracies}
    \PY{n}{accuracies} \PY{o}{=} \PY{n}{cross\PYZus{}val\PYZus{}score}\PY{p}{(}\PY{n}{estimator} \PY{o}{=} \PY{n}{svc}\PY{p}{,} \PY{n}{X} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{p}{,} 
                                 \PY{n}{y} \PY{o}{=} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{cv} \PY{o}{=} \PY{l+m+mi}{10}\PY{p}{)}
    
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{accuracies}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}
    \PY{k}{return} \PY{n}{accuracy}
\end{Verbatim}
\end{tcolorbox}

    This function plots the variation of the SVC parameters against the
prediction accuracy to show the \textbf{effect of variation} and its
\textbf{limits regarding} the phenomenon of \textbf{overfitting}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{103}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k}{def} \PY{n+nf}{plotParamsAcc}\PY{p}{(}\PY{n}{param\PYZus{}list}\PY{p}{,} \PY{n}{acc\PYZus{}list}\PY{p}{,} \PY{n}{param\PYZus{}name}\PY{p}{,} \PY{n}{log\PYZus{}scale}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}\PY{p}{:}
    \PY{n}{fig}\PY{p}{,} \PY{n}{ax} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{12}\PY{p}{,} \PY{l+m+mi}{4}\PY{p}{)}\PY{p}{)}
    \PY{n}{title\PYZus{}str} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Variation of }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{ parameter }\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{param\PYZus{}name}\PY{p}{)} \PYZbs{}
                \PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{and its effect to prediction accuracy}\PY{l+s+s1}{\PYZsq{}}
    \PY{c+c1}{\PYZsh{} Set the title of the diagram}
    \PY{c+c1}{\PYZsh{} pad ... defines the distance of the title from the top of the diagram}
    \PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{n}{title\PYZus{}str}\PY{p}{,} \PY{n}{pad}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{)}
    \PY{n}{ax}\PY{o}{.}\PY{n}{plot}\PY{p}{(}\PY{n}{param\PYZus{}list}\PY{p}{,} \PY{n}{acc\PYZus{}list}\PY{p}{)}
    \PY{k}{if} \PY{n}{log\PYZus{}scale}\PY{p}{:}
        \PY{c+c1}{\PYZsh{} set the X axis scale to logarithmic}
        \PY{n}{ax}\PY{o}{.}\PY{n}{set\PYZus{}xscale}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{log}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{xlabel}\PY{p}{(}\PY{n}{param\PYZus{}name}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{ylabel}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{accuracy [}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{]}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{grid}\PY{p}{(}\PY{n}{visible}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{vary-kernel-of-svc}{%
\subsection{\texorpdfstring{Vary \texttt{kernel} of
SVC}{Vary kernel of SVC}}\label{vary-kernel-of-svc}}

The \texttt{kernel} parameter selects the type of hyperplane that is
used to separate the data. Using the \texttt{linear}
(\href{https://en.wikipedia.org/wiki/Linear_classifier}{linear
classifier}) kernel results in a linear hyperplane (a line in the case of
2D data). The \texttt{rbf}
(\href{https://en.wikipedia.org/wiki/Radial_basis_function_kernel}{radial
basis function kernel}) and \texttt{poly}
(\href{https://en.wikipedia.org/wiki/Polynomial_kernel}{polynomial
kernel}) kernels use non-linear hyperplanes. The \textbf{default} is
\texttt{kernel=\textquotesingle{}rbf\textquotesingle{}}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{104}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{kernels} \PY{o}{=} \PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{linear}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{poly}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{sigmoid}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{xlabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal length}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{ylabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal width}\PY{l+s+s1}{\PYZsq{}}

\PY{c+c1}{\PYZsh{} Setup 2x2 grid for plotting}
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{14}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{kernel}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{kernels}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{n}{svc\PYZus{}plot} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{n}{kernel}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{)}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{n}{kernel}\PY{p}{)}
    \PY{n}{title\PYZus{}str} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{kernel: }\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{+}\PY{n+nb}{str}\PY{p}{(}\PY{n}{kernel}\PY{p}{)}\PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{, }\PY{l+s+s1}{\PYZsq{}} \PYZbs{}
                \PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Acc. prediction: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}
    \PY{n}{plotSVC}\PY{p}{(}\PY{n}{title\PYZus{}str}\PY{p}{,} \PY{n}{svc\PYZus{}plot}\PY{p}{,} \PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{,} \PY{n}{xlabel}\PY{p}{,} \PY{n}{ylabel}\PY{p}{,} \PY{n}{subplot}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_254_0.png}\end{center}
        \caption{This group of images shows the effect of the different SVC kernels (\texttt{linear}, \texttt{rbf}, \texttt{poly} and \texttt{sigmoid}) on the classification.}
        \label{fig:vary_kernels}
    \end{figure}
    
    \hypertarget{vary-gamma-parameter}{%
\subsection{\texorpdfstring{Vary \texttt{gamma}
parameter}{Vary gamma parameter}}\label{vary-gamma-parameter}}

The \texttt{gamma} parameter is used for \textbf{non-linear
hyperplanes}. The higher the \texttt{gamma} value, the more closely the
model tries to fit the training dataset exactly. The \textbf{default} is
\texttt{gamma=\textquotesingle{}scale\textquotesingle{}}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{105}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{gammas} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mf}{0.3}\PY{p}{,} \PY{l+m+mf}{0.5}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{]}

\PY{n}{xlabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal length}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{ylabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal width}\PY{l+s+s1}{\PYZsq{}}

\PY{c+c1}{\PYZsh{} Setup 3x2 grid (3 rows, 2 columns) for plotting}
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{14}\PY{p}{,} \PY{l+m+mi}{15}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{gamma}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{gammas}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{n}{svc\PYZus{}plot} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{n}{gamma}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{)}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{n}{gamma}\PY{p}{)}
    \PY{n}{title\PYZus{}str} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{gamma: }\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{+}\PY{n+nb}{str}\PY{p}{(}\PY{n}{gamma}\PY{p}{)}\PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{, }\PY{l+s+s1}{\PYZsq{}} \PYZbs{}
                \PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Acc. prediction: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}
    \PY{n}{plotSVC}\PY{p}{(}\PY{n}{title\PYZus{}str}\PY{p}{,} \PY{n}{svc\PYZus{}plot}\PY{p}{,} \PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{,} \PY{n}{xlabel}\PY{p}{,} \PY{n}{ylabel}\PY{p}{,} \PY{n}{subplot}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_256_0.png}\end{center}
        \caption{This group of images shows the effect of varying the parameter \texttt{gamma} of the \texttt{rbf} kernel on the classification.}
        \label{fig:vary_gamma_parameter}
    \end{figure}
    
    Show the variation of the SVC parameter \texttt{gamma} against the
\textbf{prediction accuracy}.

As we can see, increasing \texttt{gamma} leads to \textbf{overfitting}
as the classifier tries to perfectly fit the training data.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{106}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{gammas} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mf}{0.2}\PY{p}{,} \PY{l+m+mf}{0.3}\PY{p}{,} \PY{l+m+mf}{0.4}\PY{p}{,} \PY{l+m+mf}{0.5}\PY{p}{,} \PY{l+m+mf}{0.6}\PY{p}{,} \PY{l+m+mf}{0.7}\PY{p}{,} \PY{l+m+mf}{0.8}\PY{p}{,} \PY{l+m+mf}{0.9}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{200}\PY{p}{]}

\PY{n}{accuracy\PYZus{}list} \PY{o}{=} \PY{n+nb}{list}\PY{p}{(}\PY{p}{)}
\PY{k}{for} \PY{n}{gamma} \PY{o+ow}{in} \PY{n}{gammas}\PY{p}{:}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{n}{gamma}\PY{p}{)}
    \PY{n}{accuracy\PYZus{}list}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}

\PY{n}{plotParamsAcc}\PY{p}{(}\PY{n}{gammas}\PY{p}{,} \PY{n}{accuracy\PYZus{}list}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{gamma}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{log\PYZus{}scale}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_258_0.png}\end{center}
        \caption{The plot shows the prediction accuracy as a function of the SVC parameter \texttt{gamma}.}
        \label{fig:plot_vary_gamma}
    \end{figure}
    
    \hypertarget{vary-c-parameter}{%
\subsection{\texorpdfstring{Vary \texttt{C}
parameter}{Vary C parameter}}\label{vary-c-parameter}}

The \texttt{C} parameter is the \textbf{penalty} of the error term. It
controls the trade-off between a smooth decision boundary and classifying
the training points correctly. The \textbf{default} is \texttt{C=1.0}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{107}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{cs} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{1000}\PY{p}{]}

\PY{n}{xlabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal length}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{ylabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal width}\PY{l+s+s1}{\PYZsq{}}

\PY{c+c1}{\PYZsh{} Setup 3x2 grid (3 rows, 2 columns) for plotting}
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{14}\PY{p}{,} \PY{l+m+mi}{15}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{c}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{cs}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{n}{svc\PYZus{}plot} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{n}{c}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{)}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{n}{c}\PY{p}{)}
    \PY{n}{title\PYZus{}str} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{C: }\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{+}\PY{n+nb}{str}\PY{p}{(}\PY{n}{c}\PY{p}{)}\PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{, }\PY{l+s+s1}{\PYZsq{}} \PYZbs{}
                 \PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Acc. prediction: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}
    \PY{n}{plotSVC}\PY{p}{(}\PY{n}{title\PYZus{}str}\PY{p}{,} \PY{n}{svc\PYZus{}plot}\PY{p}{,} \PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{,} \PY{n}{xlabel}\PY{p}{,} \PY{n}{ylabel}\PY{p}{,} \PY{n}{subplot}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_260_0.png}\end{center}
        \caption{This group of images shows the effect of varying the parameter \texttt{C} of the \texttt{rbf} kernel on the classification.}
        \label{fig:vary_c_parameter}
    \end{figure}
    
    Show the variation of the SVC parameter \texttt{C} against the
\textbf{prediction accuracy}.

But be careful: too high \texttt{C} values may lead to
\textbf{overfitting} the training data.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{108}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{cs} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{,} \PY{l+m+mi}{7}\PY{p}{,} \PY{l+m+mi}{8}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{1000}\PY{p}{,} \PY{l+m+mi}{10000}\PY{p}{]}

\PY{n}{accuracy\PYZus{}list} \PY{o}{=} \PY{n+nb}{list}\PY{p}{(}\PY{p}{)}
\PY{k}{for} \PY{n}{c} \PY{o+ow}{in} \PY{n}{cs}\PY{p}{:}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{rbf}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{n}{c}\PY{p}{)}
    \PY{n}{accuracy\PYZus{}list}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}

\PY{n}{plotParamsAcc}\PY{p}{(}\PY{n}{cs}\PY{p}{,} \PY{n}{accuracy\PYZus{}list}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{C}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{log\PYZus{}scale}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_262_0.png}\end{center}
        \caption{The plot shows the prediction accuracy as a function of the SVC parameter \texttt{C}.}
        \label{fig:plot_vary_c}
    \end{figure}
    
    \hypertarget{vary-degree-parameter}{%
\subsection{\texorpdfstring{Vary \texttt{degree}
parameter}{Vary degree parameter}}\label{vary-degree-parameter}}

The \texttt{degree} parameter is used when the \texttt{kernel} is set to
\texttt{poly} and is ignored by all other kernels. It's basically the
\textbf{degree of the polynomial} used to find the hyperplane to split
the data. The \textbf{default} is \texttt{degree=3}.

Using \texttt{degree\ =\ 1} is the same as using a \texttt{linear}
kernel. Also, increasing this parameter leads to \textbf{higher
training times}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{109}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{degrees} \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{4}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{]}

\PY{n}{xlabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal length}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{ylabel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Petal width}\PY{l+s+s1}{\PYZsq{}}

\PY{c+c1}{\PYZsh{} Setup 3x2 grid (3 rows, 2 columns) for plotting}
\PY{n}{fig}\PY{p}{,} \PY{n}{subplots} \PY{o}{=} \PY{n}{plt}\PY{o}{.}\PY{n}{subplots}\PY{p}{(}\PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{n}{figsize}\PY{o}{=}\PY{p}{(}\PY{l+m+mi}{14}\PY{p}{,} \PY{l+m+mi}{15}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{} Set margins between subplots}
\PY{n}{plt}\PY{o}{.}\PY{n}{subplots\PYZus{}adjust}\PY{p}{(}\PY{n}{wspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{,} \PY{n}{hspace}\PY{o}{=}\PY{l+m+mf}{0.3}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Make subplots iterable via \PYZsq{}subplots.flatten()\PYZsq{}}
\PY{k}{for} \PY{n}{degree}\PY{p}{,} \PY{n}{subplot} \PY{o+ow}{in} \PY{n+nb}{zip}\PY{p}{(}\PY{n}{degrees}\PY{p}{,} \PY{n}{subplots}\PY{o}{.}\PY{n}{flatten}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{:}
    \PY{n}{svc\PYZus{}plot} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{poly}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{n}{degree}\PY{p}{)}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{)}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{poly}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{n}{degree}\PY{p}{)}
    \PY{n}{title\PYZus{}str} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{degree: }\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{+}\PY{n+nb}{str}\PY{p}{(}\PY{n}{degree}\PY{p}{)}\PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+se}{\PYZbs{}\PYZsq{}}\PY{l+s+s1}{, }\PY{l+s+s1}{\PYZsq{}} \PYZbs{}
                 \PY{o}{+}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Acc. prediction: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}
    \PY{n}{plotSVC}\PY{p}{(}\PY{n}{title\PYZus{}str}\PY{p}{,} \PY{n}{svc\PYZus{}plot}\PY{p}{,} \PY{n}{X\PYZus{}plot}\PY{p}{,} \PY{n}{y\PYZus{}plot}\PY{p}{,} \PY{n}{xlabel}\PY{p}{,} \PY{n}{ylabel}\PY{p}{,} \PY{n}{subplot}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_264_0.png}\end{center}
        \caption{This group of images shows the effect of varying the parameter \texttt{degree} of the \texttt{poly} kernel on the classification.}
        \label{fig:vary_degree_parameter}
    \end{figure}
    
    Show the variation of the SVC parameter \texttt{degree} against the
\textbf{prediction accuracy}.

As we can see, increasing the \texttt{degree} of the polynomial
hyperplane leads to \textbf{overfitting} the training data.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{110}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{degrees} \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{3}\PY{p}{,} \PY{l+m+mi}{4}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{6}\PY{p}{,} \PY{l+m+mi}{7}\PY{p}{,} \PY{l+m+mi}{8}\PY{p}{,} \PY{l+m+mi}{9}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{]}

\PY{n}{accuracy\PYZus{}list} \PY{o}{=} \PY{n+nb}{list}\PY{p}{(}\PY{p}{)}
\PY{k}{for} \PY{n}{degree} \PY{o+ow}{in} \PY{n}{degrees}\PY{p}{:}
    \PY{n}{accuracy} \PY{o}{=} \PY{n}{crossValSVC}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{kernel}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{poly}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{n}{degree}\PY{p}{)}
    \PY{n}{accuracy\PYZus{}list}\PY{o}{.}\PY{n}{append}\PY{p}{(}\PY{n}{accuracy}\PY{p}{)}

\PY{n}{plotParamsAcc}\PY{p}{(}\PY{n}{degrees}\PY{p}{,} \PY{n}{accuracy\PYZus{}list}\PY{p}{,} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{degree}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{log\PYZus{}scale}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_266_0.png}\end{center}
        \caption{The plot shows the prediction accuracy as a function of the SVC parameter \texttt{degree}.}
        \label{fig:plot_vary_degree}
    \end{figure}
    
    \hypertarget{step-8-tune-the-ml-model-systematically}{%
\section{STEP 8: Tune the ML model
systematically}\label{step-8-tune-the-ml-model-systematically}}

In the final step, two approaches to systematic hyper-parameter search
are presented: \textbf{Grid Search} and \textbf{Randomized Search}.
While the former exhaustively considers all parameter combinations for
given values, the latter selects a number of candidates from a parameter
space with a particular random distribution.

Sources:

\begin{itemize}
\tightlist
\item
  \href{https://scikit-learn.org/stable/modules/grid_search.html}{3.2.
  Tuning the hyper-parameters of an estimator}

  \begin{itemize}
  \tightlist
  \item
    \href{https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html}{sklearn.model\_selection.GridSearchCV}
  \item
    \href{https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html\#sklearn.model_selection.RandomizedSearchCV}{sklearn.model\_selection.RandomizedSearchCV}
  \end{itemize}
\item
  \href{https://pyimagesearch.com/2021/05/17/introduction-to-hyperparameter-tuning-with-scikit-learn-and-python/}{Introduction
  to hyperparameter tuning with scikit-learn and Python}

  \begin{itemize}
  \tightlist
  \item
    \href{https://www.kaggle.com/datasets/rodolfomendes/abalone-dataset?resource=download}{Abalone
    Dataset}
  \end{itemize}
\item
  \href{https://medium.com/@jackstalfort/hyperparameter-tuning-using-grid-search-and-random-search-f8750a464b35}{Hyperparameter
  tuning using Grid Search and Random Search: A Conceptual Guide}
\end{itemize}

    Import the necessary packages:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{111}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} general packages}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{preprocessing} \PY{k+kn}{import} \PY{n}{StandardScaler}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{train\PYZus{}test\PYZus{}split}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{cross\PYZus{}val\PYZus{}score}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{accuracy\PYZus{}score}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{classification\PYZus{}report}
\PY{c+c1}{\PYZsh{}from sklearn.svm import SVC}
\PY{k+kn}{from} \PY{n+nn}{sklearn} \PY{k+kn}{import} \PY{n}{svm}\PY{p}{,} \PY{n}{metrics}
\PY{k+kn}{import} \PY{n+nn}{pandas} \PY{k}{as} \PY{n+nn}{pd}
\PY{k+kn}{import} \PY{n+nn}{seaborn} \PY{k}{as} \PY{n+nn}{sns}
\PY{k+kn}{import} \PY{n+nn}{matplotlib}\PY{n+nn}{.}\PY{n+nn}{pyplot} \PY{k}{as} \PY{n+nn}{plt}
\PY{o}{\PYZpc{}}\PY{k}{matplotlib} inline

\PY{c+c1}{\PYZsh{} additional packages for grid search}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{RepeatedKFold}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{GridSearchCV}

\PY{c+c1}{\PYZsh{} additional packages for randomized search}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{RandomizedSearchCV}
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{model\PYZus{}selection} \PY{k+kn}{import} \PY{n}{RepeatedKFold}

\PY{c+c1}{\PYZsh{} import class MeasExecTimeOfProgram from python file MeasExecTimeOfProgram\PYZus{}class.py}
\PY{k+kn}{from} \PY{n+nn}{MeasExecTimeOfProgram\PYZus{}class} \PY{k+kn}{import} \PY{n}{MeasExecTimeOfProgram}
\end{Verbatim}
\end{tcolorbox}

    Set path and columns of the Iris dataset for import:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{112}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Path of the ORIGINAL Iris dataset for classification}
\PY{c+c1}{\PYZsh{}CSV\PYZus{}PATH = \PYZdq{}./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv\PYZdq{}}

\PY{c+c1}{\PYZsh{} Path of the NOISED Iris dataset for classification}
\PY{n}{CSV\PYZus{}PATH} \PY{o}{=} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}noised.csv}\PY{l+s+s2}{\PYZdq{}}
\end{Verbatim}
\end{tcolorbox}

    Load dataset and split it into subsets for training and testing in the
ratio 80\% to 20\%:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{113}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} load the dataset, separate the features and labels, and perform a}
\PY{c+c1}{\PYZsh{} training and testing split using 80\PYZpc{} of the data for training and}
\PY{c+c1}{\PYZsh{} 20\PYZpc{} for evaluation}
\PY{n}{irisdata\PYZus{}df} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{n}{CSV\PYZus{}PATH}\PY{p}{)}

\PY{n}{X} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{o}{.}\PY{n}{drop}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{axis}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}
\PY{n}{y} \PY{o}{=} \PY{n}{irisdata\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}

\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{X\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}test} \PY{o}{=} \PY{n}{train\PYZus{}test\PYZus{}split}\PY{p}{(}\PY{n}{X}\PY{p}{,} \PY{n}{y}\PY{p}{,} 
                                                    \PY{n}{test\PYZus{}size} \PY{o}{=} \PY{l+m+mf}{0.20}\PY{p}{,} 
                                                    \PY{n}{shuffle}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Check that the split datasets are still balanced and that no
\textbf{bias} has been created by the splitting.

For this test, the previously separated labels \texttt{y\_train} must be
added back to the training dataset \texttt{X\_train}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{114}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} make a deep copy of \PYZsq{}X\PYZus{}train\PYZsq{}}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} add list of labels to test dataframe}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]} \PY{o}{=} \PY{n}{y\PYZus{}train}

\PY{c+c1}{\PYZsh{} count unique values without missing values in a column, }
\PY{c+c1}{\PYZsh{} ordered descending and normalized}
\PY{n}{X\PYZus{}train\PYZus{}bias\PYZus{}test\PYZus{}df}\PY{p}{[}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{]}\PY{o}{.}\PY{n}{value\PYZus{}counts}\PY{p}{(}\PY{n}{ascending}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} 
                                             \PY{n}{dropna}\PY{o}{=}\PY{k+kc}{False}\PY{p}{,} 
                                             \PY{n}{normalize}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{114}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
Iris-versicolor    0.35
Iris-virginica     0.35
Iris-setosa        0.30
Name: species, dtype: float64
\end{Verbatim}
\end{tcolorbox}
        
    Standardize the feature values by computing the \textbf{mean},
subtracting the mean from the data points, and then dividing by the
\textbf{standard deviation}:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{115}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{scaler} \PY{o}{=} \PY{n}{StandardScaler}\PY{p}{(}\PY{p}{)}
\PY{n}{X\PYZus{}train} \PY{o}{=} \PY{n}{scaler}\PY{o}{.}\PY{n}{fit\PYZus{}transform}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{)}
\PY{n}{X\PYZus{}test} \PY{o}{=} \PY{n}{scaler}\PY{o}{.}\PY{n}{transform}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}

\PY{c+c1}{\PYZsh{}X\PYZus{}train}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{finding-a-baseline}{%
\subsection{Finding a baseline}\label{finding-a-baseline}}

The aim of this sub-step is to establish a baseline on the Iris dataset
by training a \textbf{Support Vector Classifier (SVC)} with no
hyperparameter tuning.

Train the model with \textbf{no tuning of hyperparameters} to find the
baseline for later improvements:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{116}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{classifier} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{n}{kernel} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{linear}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{random\PYZus{}state} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{)}

\PY{c+c1}{\PYZsh{} initiate measuring execution time}
\PY{n}{execTime} \PY{o}{=} \PY{n}{MeasExecTimeOfProgram}\PY{p}{(}\PY{p}{)}
\PY{n}{execTime}\PY{o}{.}\PY{n}{start}\PY{p}{(}\PY{p}{)}

\PY{n}{classifier}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{)}

\PY{c+c1}{\PYZsh{} print time delta}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Execution time: }\PY{l+s+si}{\PYZob{}:.4f\PYZcb{}}\PY{l+s+s1}{ ms}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{execTime}\PY{o}{.}\PY{n}{stop}\PY{p}{(}\PY{p}{)}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Execution time: 0.7720 ms
    \end{Verbatim}

    Evaluate our model using accuracy score:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{117}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} predict labels}
\PY{n}{y\PYZus{}pred} \PY{o}{=} \PY{n}{classifier}\PY{o}{.}\PY{n}{predict}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{118}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} calculate cross validation score}
\PY{c+c1}{\PYZsh{} HINT: do NOT use the accuracy score \PYZhy{} it\PYZsq{}s too inaccurate!}
\PY{n}{accuracies} \PY{o}{=} \PY{n}{cross\PYZus{}val\PYZus{}score}\PY{p}{(}\PY{n}{estimator} \PY{o}{=} \PY{n}{classifier}\PY{p}{,} \PY{n}{X} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{p}{,} 
                             \PY{n}{y} \PY{o}{=} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{cv} \PY{o}{=} \PY{l+m+mi}{10}\PY{p}{)}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Cross\PYZhy{}validation score: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Standard Deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{std}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Cross-validation score: 95.83 \%
Standard Deviation: 5.59 \%
    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{119}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} print classification report}
\PY{n+nb}{print}\PY{p}{(}\PY{n}{classification\PYZus{}report}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00        14
Iris-versicolor       0.88      0.88      0.88         8
 Iris-virginica       0.88      0.88      0.88         8

       accuracy                           0.93        30
      macro avg       0.92      0.92      0.92        30
   weighted avg       0.93      0.93      0.93        30

    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{120}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{white}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} print colored confusion matrix}
\PY{n}{cm\PYZus{}colored} \PY{o}{=} \PY{n}{metrics}\PY{o}{.}\PY{n}{ConfusionMatrixDisplay}\PY{o}{.}\PY{n}{from\PYZus{}predictions}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figwidth}\PY{p}{(}\PY{l+m+mi}{8}\PY{p}{)}
\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figheight}\PY{p}{(}\PY{l+m+mi}{7}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{confusion\PYZus{}matrix}

\PY{c+c1}{\PYZsh{} y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Colored Confusion Matrix}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{1.1}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{tight\PYZus{}layout}\PY{p}{(}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_284_0.png}\end{center}
        \caption{Confusion matrix of the baseline classifier's predictions on the test dataset.}
        \label{fig:cm_baseline}
    \end{figure}
    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{121}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{classifier}\PY{o}{.}\PY{n}{get\PYZus{}params}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{121}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\{'C': 1.0,
 'break\_ties': False,
 'cache\_size': 200,
 'class\_weight': None,
 'coef0': 0.0,
 'decision\_function\_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'linear',
 'max\_iter': -1,
 'probability': False,
 'random\_state': 0,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False\}
\end{Verbatim}
\end{tcolorbox}
        
    \hypertarget{add-gaussian-noise-to-iris-dataset}{%
\subsection{Add Gaussian noise to Iris
dataset}\label{add-gaussian-noise-to-iris-dataset}}

Recording \textbf{datasets from real applications} is always associated
with several problems. Real measured values are always subject to a
certain \textbf{level of measurement noise}. Furthermore, when recording
the measured values, there may be sporadic dropouts of the measurement
sensors, which lead to \textbf{gaps in the dataset}. And finally,
\textbf{duplicates}, i.e.~several identical measurements, can occur when
merging several measurement series from different experiments.

These problems from everyday measurement practice are illustrated
using the example of the Iris dataset. Unfortunately, this dataset is a little
``too perfect''. To simulate \textbf{real measurement values}, some
\textbf{Gaussian noise} with a defined \textbf{standard deviation
\(\sigma\)} is added to the features of the Iris dataset. To simulate an
\textbf{offset} due to \textbf{imperfectly calibrated measurement
devices}, for example, the mean value could additionally be shifted.
However, because this has \textbf{no influence on the classifiability},
it is omitted here.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{122}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Import Iris dataset for adding noise}
\PY{n}{irisdata\PYZus{}df\PYZus{}orig} \PY{o}{=} \PY{n}{pd}\PY{o}{.}\PY{n}{read\PYZus{}csv}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle.csv}\PY{l+s+s1}{\PYZsq{}}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    First, determine the shape of the Iris dataset. The last column with the
class names is omitted to get only the feature columns.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{123}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Get number of rows of the dataset}
\PY{n}{n\PYZus{}rows} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{0}\PY{p}{]}

\PY{c+c1}{\PYZsh{} Get number of columns of the dataset}
\PY{c+c1}{\PYZsh{} Omit last column with the class names}
\PY{n}{n\PYZus{}cols} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{shape}\PY{p}{[}\PY{l+m+mi}{1}\PY{p}{]} \PY{o}{\PYZhy{}} \PY{l+m+mi}{1}
\end{Verbatim}
\end{tcolorbox}

    Now a \texttt{numpy} array in the shape of the Iris dataset is
generated. This contains \textbf{normally distributed random values}
according to the \textbf{Gaussian curve} with a defined \textbf{standard
deviation \(\sigma\)}. The \textbf{mean} of the Gaussian curve is kept at
zero, i.e.~it is \textbf{not shifted} for now.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{124}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} mean: \PYZdq{}centre\PYZdq{} of the distribution}
\PY{c+c1}{\PYZsh{} sigma: standard deviation (spread or “width”) of the distribution}
\PY{n}{mean}\PY{p}{,} \PY{n}{sigma} \PY{o}{=} \PY{l+m+mi}{0}\PY{p}{,} \PY{l+m+mf}{0.2}

\PY{c+c1}{\PYZsh{} Create noise with the same dimension as the dataset}
\PY{c+c1}{\PYZsh{} Set \PYZsq{}seed\PYZsq{} to something, to make the output of the random generator reproducible}
\PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{seed}\PY{p}{(}\PY{l+m+mi}{42}\PY{p}{)}
\PY{n}{irisdata\PYZus{}np\PYZus{}noise} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{random}\PY{o}{.}\PY{n}{normal}\PY{p}{(}\PY{n}{mean}\PY{p}{,} \PY{n}{sigma}\PY{p}{,} \PY{p}{(}\PY{n}{n\PYZus{}rows}\PY{p}{,} \PY{n}{n\PYZus{}cols}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    The first 4 columns of the dataframe containing the original Iris
dataset are converted to a \texttt{numpy} array.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{125}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Select columns 1\PYZhy{}4 with all rows}
\PY{c+c1}{\PYZsh{} and convert it to numpy array}
\PY{n}{irisdata\PYZus{}np\PYZus{}orig} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{iloc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{:}\PY{l+m+mi}{4}\PY{p}{]}\PY{o}{.}\PY{n}{to\PYZus{}numpy}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    The array with the normally distributed random values is added to this.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{126}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Add noise to original values}
\PY{n}{irisdata\PYZus{}np\PYZus{}noised} \PY{o}{=} \PY{n}{irisdata\PYZus{}np\PYZus{}orig} \PY{o}{+} \PY{n}{irisdata\PYZus{}np\PYZus{}noise}
\end{Verbatim}
\end{tcolorbox}

    \textbf{Negative measured values} do \textbf{not make sense} for this
dataset and should be avoided. Therefore, the \textbf{minimum
value} over the entire array is first retrieved with the function
\texttt{numpy.amin()}. If this is negative, an \textbf{integer offset}
for shifting the data into the positive range is calculated by rounding
up the absolute value of the minimum. The function
\texttt{math.ceil()} is used for this.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{127}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{import} \PY{n+nn}{math}

\PY{n}{min\PYZus{}val} \PY{o}{=} \PY{n}{np}\PY{o}{.}\PY{n}{amin}\PY{p}{(}\PY{n}{irisdata\PYZus{}np\PYZus{}noised}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Minimal value of noised array: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{min\PYZus{}val}\PY{p}{)}\PY{p}{)}

\PY{k}{if} \PY{p}{(}\PY{n}{min\PYZus{}val} \PY{o}{\PYZlt{}} \PY{l+m+mi}{0}\PY{p}{)}\PY{p}{:}
    \PY{n}{min\PYZus{}val\PYZus{}abs} \PY{o}{=} \PY{n+nb}{abs}\PY{p}{(}\PY{n}{min\PYZus{}val}\PY{p}{)}

    \PY{c+c1}{\PYZsh{} Round the min\PYZus{}val\PYZus{}abs upward to its nearest integer}
    \PY{n}{offset} \PY{o}{=} \PY{n}{math}\PY{o}{.}\PY{n}{ceil}\PY{p}{(}\PY{n}{min\PYZus{}val\PYZus{}abs}\PY{p}{)}
\PY{k}{else}\PY{p}{:}
    \PY{n}{offset} \PY{o}{=} \PY{l+m+mi}{0}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Offset for array shifting: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{offset}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Minimal value of noised array: -0.14617286328679105
Offset for array shifting: 1
    \end{Verbatim}

    The calculated \textbf{offset} is \textbf{added} to the array with the
noisy measurements to \textbf{shift} the data into the \textbf{positive
range}.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{128}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{irisdata\PYZus{}np\PYZus{}noised\PYZus{}shifted} \PY{o}{=} \PY{n}{irisdata\PYZus{}np\PYZus{}noised} \PY{o}{+} \PY{n}{offset}
\PY{c+c1}{\PYZsh{}irisdata\PYZus{}np\PYZus{}noised\PYZus{}shifted}
\end{Verbatim}
\end{tcolorbox}

    Finally, a deep copy of the original dataframe is created. In the copy,
the first 4 columns are replaced with the noisy features.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{129}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Make a deep copy of original dataframe}
\PY{n}{irisdata\PYZus{}df\PYZus{}noised} \PY{o}{=} \PY{n}{irisdata\PYZus{}df\PYZus{}orig}\PY{o}{.}\PY{n}{copy}\PY{p}{(}\PY{n}{deep}\PY{o}{=}\PY{k+kc}{True}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Replace values of dataframe with noisy values from array}
\PY{n}{irisdata\PYZus{}df\PYZus{}noised}\PY{o}{.}\PY{n}{iloc}\PY{p}{[}\PY{p}{:}\PY{p}{,} \PY{l+m+mi}{0}\PY{p}{:}\PY{l+m+mi}{4}\PY{p}{]} \PY{o}{=} \PY{n}{irisdata\PYZus{}np\PYZus{}noised\PYZus{}shifted}

\PY{n}{str\PYZus{}caption} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Get some basic statistical data of the noised Iris dataframe}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{func\PYZus{}render\PYZus{}dataframe2Markdown}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}noised}\PY{o}{.}\PY{n}{describe}\PY{p}{(}\PY{p}{)}\PY{p}{,} \PY{n}{str\PYZus{}caption}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{longtable}[]{@{}
  >{\raggedright\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.1014}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2319}}
  >{\raggedleft\arraybackslash}p{(\columnwidth - 8\tabcolsep) * \real{0.2174}}@{}}
\caption{Get some basic statistical data of the noised Iris
dataframe}\tabularnewline
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endfirsthead
\toprule\noalign{}
\begin{minipage}[b]{\linewidth}\raggedright
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
sepal\_width
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_length
\end{minipage} & \begin{minipage}[b]{\linewidth}\raggedleft
petal\_width
\end{minipage} \\
\midrule\noalign{}
\endhead
\bottomrule\noalign{}
\endlastfoot
count & 150 & 150 & 150 & 150 \\
mean & 6.84144 & 4.04848 & 4.75625 & 2.19769 \\
std & 0.869432 & 0.473661 & 1.78664 & 0.771369 \\
min & 5.16462 & 2.97705 & 1.8785 & 0.853827 \\
25\% & 6.13114 & 3.73682 & 2.63002 & 1.40023 \\
50\% & 6.77653 & 3.96714 & 5.34263 & 2.37689 \\
75\% & 7.46866 & 4.366 & 6.11382 & 2.78839 \\
max & 9.03743 & 5.36287 & 8.22899 & 3.75414 \\
\end{longtable}

    
    To compare the original Iris dataset with its noisy copy, both
dataframes are visualized in pairs plots.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{130}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Define a function to visualize data as pairs plots}
\PY{k}{def} \PY{n+nf}{plotPairs}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{title}\PY{p}{)}\PY{p}{:}
    \PY{n}{g} \PY{o}{=} \PY{n}{sns}\PY{o}{.}\PY{n}{pairplot}\PY{p}{(}\PY{n}{df}\PY{p}{,} \PY{n}{diag\PYZus{}kind}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{kde}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{n}{hue}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{species}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} 
                     \PY{n}{palette}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Dark2}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{height}\PY{o}{=}\PY{l+m+mf}{2.0}\PY{p}{)}

    \PY{n}{g}\PY{o}{.}\PY{n}{map\PYZus{}lower}\PY{p}{(}\PY{n}{sns}\PY{o}{.}\PY{n}{kdeplot}\PY{p}{,} \PY{n}{levels}\PY{o}{=}\PY{l+m+mi}{4}\PY{p}{,} \PY{n}{color}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{.2}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}
    \PY{c+c1}{\PYZsh{} x, y .. padding between title and plot}
    \PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{n}{title}\PY{p}{,} \PY{n}{x}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mf}{1.0}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{4.3}\PY{p}{)}
    \PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{131}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{title} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Pairs plot of the ORIGINAL Iris dataset}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{plotPairs}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}orig}\PY{p}{,} \PY{n}{title}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_304_0.png}\end{center}
        \caption{Pairs plot of the original Iris dataset}
        \label{fig:pairs_plot_orig_Iris}
    \end{figure}
    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{132}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{title} \PY{o}{=} \PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Pairs plot of the NOISED Iris dataset}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{plotPairs}\PY{p}{(}\PY{n}{irisdata\PYZus{}df\PYZus{}noised}\PY{p}{,} \PY{n}{title}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\adjustimage{max size={0.9\linewidth}{0.9\paperheight}}{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_305_0.png}\end{center}
        \caption{Pairs plot of the noised Iris dataset}
        \label{fig:pairs_plot_noised_Iris}
    \end{figure}
    
    Finally, the noisy Iris dataset is saved in its own CSV file.

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{133}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} Save noised Iris dataset to CSV file without index}
\PY{n}{csv\PYZus{}filepath} \PY{o}{=} \PY{l+s+sa}{r}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{./datasets/IRIS\PYZus{}flower\PYZus{}dataset\PYZus{}kaggle\PYZus{}noised.csv}\PY{l+s+s1}{\PYZsq{}}
\PY{n}{irisdata\PYZus{}df\PYZus{}noised}\PY{o}{.}\PY{n}{to\PYZus{}csv}\PY{p}{(}\PY{n}{csv\PYZus{}filepath}\PY{p}{,} \PY{n}{sep}\PY{o}{=}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{,}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{index}\PY{o}{=}\PY{k+kc}{False}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \hypertarget{grid-search}{%
\subsection{Grid search}\label{grid-search}}

    Initialize the SVC model and define the \textbf{space of the
hyperparameters} to perform the \textbf{grid search} over:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{148}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{classifier} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{p}{)}

\PY{c+c1}{\PYZsh{}kernels = [\PYZdq{}linear\PYZdq{}, \PYZdq{}rbf\PYZdq{}, \PYZdq{}sigmoid\PYZdq{}, \PYZdq{}poly\PYZdq{}]}
\PY{n}{kernels} \PY{o}{=} \PY{p}{[}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{rbf}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{poly}\PY{l+s+s2}{\PYZdq{}}\PY{p}{]}
\PY{n}{gammas} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{200}\PY{p}{]}
\PY{n}{cs} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{1000}\PY{p}{]}

\PY{c+c1}{\PYZsh{} reduce the possible polynomial degrees to reasonable values,}
\PY{c+c1}{\PYZsh{} since with higher degrees the calculation time increases exponentially}
\PY{c+c1}{\PYZsh{}degrees = [1, 2, 3, 4, 5]}
\PY{n}{degrees} \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{3}\PY{p}{]}

\PY{n}{grid} \PY{o}{=} \PY{n+nb}{dict}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{n}{kernels}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{n}{gammas}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{n}{cs}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{n}{degrees}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Initialize a \textbf{cross-validation fold} and \textbf{perform a grid
search} to tune the hyperparameters:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{149}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{cvFold} \PY{o}{=} \PY{n}{RepeatedKFold}\PY{p}{(}\PY{n}{n\PYZus{}splits}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{,} \PY{n}{n\PYZus{}repeats}\PY{o}{=}\PY{l+m+mi}{3}\PY{p}{,} \PY{n}{random\PYZus{}state}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}

\PY{n}{gridSearch} \PY{o}{=} \PY{n}{GridSearchCV}\PY{p}{(}\PY{n}{estimator}\PY{o}{=}\PY{n}{classifier}\PY{p}{,} \PY{n}{param\PYZus{}grid}\PY{o}{=}\PY{n}{grid}\PY{p}{,} \PY{n}{n\PYZus{}jobs}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{,}
                          \PY{n}{cv}\PY{o}{=}\PY{n}{cvFold}\PY{p}{,} \PY{n}{scoring}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{accuracy}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} initiate measuring execution time}
\PY{n}{execTime} \PY{o}{=} \PY{n}{MeasExecTimeOfProgram}\PY{p}{(}\PY{p}{)}
\PY{n}{execTime}\PY{o}{.}\PY{n}{start}\PY{p}{(}\PY{p}{)}

\PY{n}{searchResults} \PY{o}{=} \PY{n}{gridSearch}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Print execution time delta}
\PY{n}{execTime\PYZus{}sec} \PY{o}{=} \PY{n}{execTime}\PY{o}{.}\PY{n}{stop}\PY{p}{(}\PY{p}{)}\PY{o}{/}\PY{l+m+mi}{1000}
\PY{n}{execTime\PYZus{}str} \PY{o}{=} \PY{n}{time}\PY{o}{.}\PY{n}{strftime}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{H h, }\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{M min, }\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{S sec}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{time}\PY{o}{.}\PY{n}{gmtime}\PY{p}{(}\PY{n}{execTime\PYZus{}sec}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{}print(\PYZsq{}Execution time: \PYZob{}:.3f\PYZcb{} s\PYZsq{}.format(execTime\PYZus{}sec))}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Execution time: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{execTime\PYZus{}str}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Execution time: 00 h, 24 min, 35 sec
    \end{Verbatim}

    Extract the best model and evaluate it:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{150}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} predict labels by best model}
\PY{n}{bestModel} \PY{o}{=} \PY{n}{searchResults}\PY{o}{.}\PY{n}{best\PYZus{}estimator\PYZus{}}

\PY{n}{y\PYZus{}pred} \PY{o}{=} \PY{n}{bestModel}\PY{o}{.}\PY{n}{predict}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{158}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} calculate cross validation score from the best model}
\PY{c+c1}{\PYZsh{} HINT: do NOT use the accuracy score \PYZhy{} it\PYZsq{}s too inaccurate!}
\PY{n}{accuracies} \PY{o}{=} \PY{n}{cross\PYZus{}val\PYZus{}score}\PY{p}{(}\PY{n}{estimator} \PY{o}{=} \PY{n}{bestModel}\PY{p}{,} \PY{n}{X} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{p}{,} 
                             \PY{n}{y} \PY{o}{=} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{cv} \PY{o}{=} \PY{l+m+mi}{20}\PY{p}{)}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Cross\PYZhy{}validation score: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Standard Deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{std}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Cross-validation score: 96.67 \%
Standard Deviation: 6.67 \%
    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{159}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{classification\PYZus{}report}

\PY{n+nb}{print}\PY{p}{(}\PY{n}{classification\PYZus{}report}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       0.92      1.00      0.96        11
 Iris-virginica       1.00      0.90      0.95        10

       accuracy                           0.97        30
      macro avg       0.97      0.97      0.97        30
   weighted avg       0.97      0.97      0.97        30

    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{160}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{white}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} print colored confusion matrix}
\PY{n}{cm\PYZus{}colored} \PY{o}{=} \PY{n}{metrics}\PY{o}{.}\PY{n}{ConfusionMatrixDisplay}\PY{o}{.}\PY{n}{from\PYZus{}predictions}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figwidth}\PY{p}{(}\PY{l+m+mi}{8}\PY{p}{)}
\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figheight}\PY{p}{(}\PY{l+m+mi}{7}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{confusion\PYZus{}matrix}

\PY{c+c1}{\PYZsh{} y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Colored Confusion Matrix}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{1.1}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{tight\PYZus{}layout}\PY{p}{(}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_317_0.png}\end{center}
        \caption{Confusion matrix of the test set predictions made by the best model from the grid search}
        \label{fig:cm_grid_search}
    \end{figure}
    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{154}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{bestModel}\PY{o}{.}\PY{n}{get\PYZus{}params}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{154}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\{'C': 1,
 'break\_ties': False,
 'cache\_size': 200,
 'class\_weight': None,
 'coef0': 0.0,
 'decision\_function\_shape': 'ovr',
 'degree': 1,
 'gamma': 0.1,
 'kernel': 'rbf',
 'max\_iter': -1,
 'probability': False,
 'random\_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False\}
\end{Verbatim}
\end{tcolorbox}
        
    \hypertarget{randomized-search}{%
\subsection{Randomized search}\label{randomized-search}}

    Initialize the SVC model and define the \textbf{space of the
hyperparameters} to perform the \textbf{randomized search} over:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{189}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{classifier} \PY{o}{=} \PY{n}{svm}\PY{o}{.}\PY{n}{SVC}\PY{p}{(}\PY{p}{)}

\PY{c+c1}{\PYZsh{}kernels = [\PYZdq{}linear\PYZdq{}, \PYZdq{}rbf\PYZdq{}, \PYZdq{}sigmoid\PYZdq{}, \PYZdq{}poly\PYZdq{}]}
\PY{n}{kernels} \PY{o}{=} \PY{p}{[}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{rbf}\PY{l+s+s2}{\PYZdq{}}\PY{p}{,} \PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{poly}\PY{l+s+s2}{\PYZdq{}}\PY{p}{]}
\PY{n}{gammas} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{200}\PY{p}{]}
\PY{n}{cs} \PY{o}{=} \PY{p}{[}\PY{l+m+mf}{0.1}\PY{p}{,} \PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{5}\PY{p}{,} \PY{l+m+mi}{10}\PY{p}{,} \PY{l+m+mi}{100}\PY{p}{,} \PY{l+m+mi}{1000}\PY{p}{]}

\PY{c+c1}{\PYZsh{} reduce the possible polynomial degrees to reasonable values,}
\PY{c+c1}{\PYZsh{} since with higher degrees the calculation time increases exponentially}
\PY{c+c1}{\PYZsh{}degrees = [1, 2, 3, 4, 5]}
\PY{n}{degrees} \PY{o}{=} \PY{p}{[}\PY{l+m+mi}{1}\PY{p}{,} \PY{l+m+mi}{2}\PY{p}{,} \PY{l+m+mi}{3}\PY{p}{]}

\PY{n}{grid} \PY{o}{=} \PY{n+nb}{dict}\PY{p}{(}\PY{n}{kernel}\PY{o}{=}\PY{n}{kernels}\PY{p}{,} \PY{n}{gamma}\PY{o}{=}\PY{n}{gammas}\PY{p}{,} \PY{n}{C}\PY{o}{=}\PY{n}{cs}\PY{p}{,} \PY{n}{degree}\PY{o}{=}\PY{n}{degrees}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    Initialize a \textbf{cross-validation fold} and \textbf{perform a
randomized search} to tune the hyperparameters:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{190}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{cvFold} \PY{o}{=} \PY{n}{RepeatedKFold}\PY{p}{(}\PY{n}{n\PYZus{}splits}\PY{o}{=}\PY{l+m+mi}{10}\PY{p}{,} \PY{n}{n\PYZus{}repeats}\PY{o}{=}\PY{l+m+mi}{3}\PY{p}{,} \PY{n}{random\PYZus{}state}\PY{o}{=}\PY{l+m+mi}{1}\PY{p}{)}

\PY{n}{randomSearch} \PY{o}{=} \PY{n}{RandomizedSearchCV}\PY{p}{(}\PY{n}{estimator}\PY{o}{=}\PY{n}{classifier}\PY{p}{,} \PY{n}{n\PYZus{}jobs}\PY{o}{=}\PY{o}{\PYZhy{}}\PY{l+m+mi}{1}\PY{p}{,}
                                  \PY{n}{cv}\PY{o}{=}\PY{n}{cvFold}\PY{p}{,} \PY{n}{param\PYZus{}distributions}\PY{o}{=}\PY{n}{grid}\PY{p}{,}
                                  \PY{n}{scoring}\PY{o}{=}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{accuracy}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} initiate measuring execution time}
\PY{n}{execTime} \PY{o}{=} \PY{n}{MeasExecTimeOfProgram}\PY{p}{(}\PY{p}{)}
\PY{n}{execTime}\PY{o}{.}\PY{n}{start}\PY{p}{(}\PY{p}{)}

\PY{n}{searchResults} \PY{o}{=} \PY{n}{randomSearch}\PY{o}{.}\PY{n}{fit}\PY{p}{(}\PY{n}{X\PYZus{}train}\PY{p}{,} \PY{n}{y\PYZus{}train}\PY{p}{)}

\PY{c+c1}{\PYZsh{} Print execution time delta}
\PY{n}{execTime\PYZus{}sec} \PY{o}{=} \PY{n}{execTime}\PY{o}{.}\PY{n}{stop}\PY{p}{(}\PY{p}{)}\PY{o}{/}\PY{l+m+mi}{1000}
\PY{n}{execTime\PYZus{}str} \PY{o}{=} \PY{n}{time}\PY{o}{.}\PY{n}{strftime}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{H h, }\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{M min, }\PY{l+s+s1}{\PYZpc{}}\PY{l+s+s1}{S sec}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{time}\PY{o}{.}\PY{n}{gmtime}\PY{p}{(}\PY{n}{execTime\PYZus{}sec}\PY{p}{)}\PY{p}{)}
\PY{c+c1}{\PYZsh{}print(\PYZsq{}Execution time: \PYZob{}:.3f\PYZcb{} s\PYZsq{}.format(execTime\PYZus{}sec))}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Execution time: }\PY{l+s+si}{\PYZob{}\PYZcb{}}\PY{l+s+s1}{\PYZsq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{execTime\PYZus{}str}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Execution time: 00 h, 00 min, 00 sec
    \end{Verbatim}

    Extract the best model and evaluate it:

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{191}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} predict labels by best model}
\PY{n}{bestModel} \PY{o}{=} \PY{n}{searchResults}\PY{o}{.}\PY{n}{best\PYZus{}estimator\PYZus{}}

\PY{n}{y\PYZus{}pred} \PY{o}{=} \PY{n}{bestModel}\PY{o}{.}\PY{n}{predict}\PY{p}{(}\PY{n}{X\PYZus{}test}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{192}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{c+c1}{\PYZsh{} calculate cross validation score from the best model}
\PY{c+c1}{\PYZsh{} HINT: do NOT use the accuracy score \PYZhy{} it\PYZsq{}s too inaccurate!}
\PY{n}{accuracies} \PY{o}{=} \PY{n}{cross\PYZus{}val\PYZus{}score}\PY{p}{(}\PY{n}{estimator} \PY{o}{=} \PY{n}{bestModel}\PY{p}{,} \PY{n}{X} \PY{o}{=} \PY{n}{X\PYZus{}train}\PY{p}{,} 
                             \PY{n}{y} \PY{o}{=} \PY{n}{y\PYZus{}train}\PY{p}{,} \PY{n}{cv} \PY{o}{=} \PY{l+m+mi}{10}\PY{p}{)}

\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Cross\PYZhy{}validation score: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{mean}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\PY{n+nb}{print}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{Standard Deviation: }\PY{l+s+si}{\PYZob{}:.2f\PYZcb{}}\PY{l+s+s2}{ }\PY{l+s+s2}{\PYZpc{}}\PY{l+s+s2}{\PYZdq{}}\PY{o}{.}\PY{n}{format}\PY{p}{(}\PY{n}{accuracies}\PY{o}{.}\PY{n}{std}\PY{p}{(}\PY{p}{)}\PY{o}{*}\PY{l+m+mi}{100}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
Cross-validation score: 95.00 \%
Standard Deviation: 6.67 \%
    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{193}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{k+kn}{from} \PY{n+nn}{sklearn}\PY{n+nn}{.}\PY{n+nn}{metrics} \PY{k+kn}{import} \PY{n}{classification\PYZus{}report}

\PY{n+nb}{print}\PY{p}{(}\PY{n}{classification\PYZus{}report}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{Verbatim}[commandchars=\\\{\}]
                 precision    recall  f1-score   support

    Iris-setosa       1.00      1.00      1.00         9
Iris-versicolor       1.00      1.00      1.00        11
 Iris-virginica       1.00      1.00      1.00        10

       accuracy                           1.00        30
      macro avg       1.00      1.00      1.00        30
   weighted avg       1.00      1.00      1.00        30

    \end{Verbatim}

    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{194}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{sns}\PY{o}{.}\PY{n}{set\PYZus{}style}\PY{p}{(}\PY{l+s+s2}{\PYZdq{}}\PY{l+s+s2}{white}\PY{l+s+s2}{\PYZdq{}}\PY{p}{)}

\PY{c+c1}{\PYZsh{} print colored confusion matrix}
\PY{n}{cm\PYZus{}colored} \PY{o}{=} \PY{n}{metrics}\PY{o}{.}\PY{n}{ConfusionMatrixDisplay}\PY{o}{.}\PY{n}{from\PYZus{}predictions}\PY{p}{(}\PY{n}{y\PYZus{}test}\PY{p}{,} \PY{n}{y\PYZus{}pred}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figwidth}\PY{p}{(}\PY{l+m+mi}{8}\PY{p}{)}
\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{figure\PYZus{}}\PY{o}{.}\PY{n}{set\PYZus{}figheight}\PY{p}{(}\PY{l+m+mi}{7}\PY{p}{)}

\PY{n}{cm\PYZus{}colored}\PY{o}{.}\PY{n}{confusion\PYZus{}matrix}

\PY{c+c1}{\PYZsh{} y .. padding between title and plot}
\PY{n}{plt}\PY{o}{.}\PY{n}{title}\PY{p}{(}\PY{l+s+s1}{\PYZsq{}}\PY{l+s+s1}{Colored Confusion Matrix}\PY{l+s+s1}{\PYZsq{}}\PY{p}{,} \PY{n}{y}\PY{o}{=}\PY{l+m+mf}{1.1}\PY{p}{)}

\PY{n}{plt}\PY{o}{.}\PY{n}{tight\PYZus{}layout}\PY{p}{(}\PY{p}{)}
\PY{n}{plt}\PY{o}{.}\PY{n}{show}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

    \begin{figure}
        \begin{center}\includegraphics[scale=0.6]{Step-by-step_intro_to_ML_with_SVC_and_Iris_files/Step-by-step_intro_to_ML_with_SVC_and_Iris_328_0.png}\end{center}
        \caption{Confusion matrix of the test set predictions made by the best model from the randomized search}
        \label{fig:cm_random_search}
    \end{figure}
    
    \begin{tcolorbox}[breakable, size=fbox, boxrule=1pt, pad at break*=1mm,colback=cellbackground, colframe=cellborder]
\prompt{In}{incolor}{195}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\PY{n}{bestModel}\PY{o}{.}\PY{n}{get\PYZus{}params}\PY{p}{(}\PY{p}{)}
\end{Verbatim}
\end{tcolorbox}

            \begin{tcolorbox}[breakable, size=fbox, boxrule=.5pt, pad at break*=1mm, opacityfill=0]
\prompt{Out}{outcolor}{195}{\boxspacing}
\begin{Verbatim}[commandchars=\\\{\}]
\{'C': 10,
 'break\_ties': False,
 'cache\_size': 200,
 'class\_weight': None,
 'coef0': 0.0,
 'decision\_function\_shape': 'ovr',
 'degree': 1,
 'gamma': 0.1,
 'kernel': 'rbf',
 'max\_iter': -1,
 'probability': False,
 'random\_state': None,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False\}
\end{Verbatim}
\end{tcolorbox}
        
    \hypertarget{summary-and-outlook}{%
\section{Summary and outlook}\label{summary-and-outlook}}

\hypertarget{english-summary}{%
\subsection{English summary}\label{english-summary}}

In November 2022, the \textbf{Artificial Intelligence Conference},
hosted by the German Social Accident Insurance (DGUV), took place in
Dresden. There, this tutorial was presented as part of a separate
\textbf{Getting Started Workshop} to interested ML newcomers working in
technical occupational safety and health at the social accident
insurance institutions.

In the \textbf{tutorial}, the \textbf{typical workflow} in
\textbf{machine learning (ML)} was demonstrated systematically and
step-by-step using the well-known \textbf{Iris dataset}. A ready-made
dataset was chosen so that ML novices can first become familiar with ML
algorithms, data analysis tools and software libraries as well as
programming systems. The task was to distinguish,
i.e.~classify, three different Iris species based on the
\textbf{dimensions} (width and length) of their \textbf{petals and
sepals}. The dataset contains \textbf{50 measured individuals per
species}.

For the classification of the dataset the very powerful \textbf{Support
Vector Classifier (SVC)} was used. Although there is a very rich
selection of other powerful ML algorithms suitable for the
classification task at hand here, the SVC algorithm was deliberately
chosen for the target group of the workshop for a comprehensible
introduction. Its working principle can be conveyed easily, both to ML
newcomers and within the time frame given for the workshop.

The main sections of the tutorial represent the individual steps in a
typical ML workflow. In \textbf{Step 0}, specific guidance was given on
the selection of hardware and software suitable for machine learning.
After introductory notes regarding \textbf{community support} to be
considered, a distinction was made between training and application
systems in the explanations of \textbf{hardware selection}. Regarding
the \textbf{software selection}, the programming languages and
programming environments (so-called IDEs) popular with ML developers
were introduced and advantages and disadvantages were mentioned in each
case. In addition, cloud-hosted IDEs were shown, which eliminate the
need to purchase suitable hardware and install the necessary software
packages to a certain extent.

As stated above, in \textbf{Step 1} the ready-made and very
beginner-friendly \textbf{Iris dataset} was imported into Python, citing
the sources of acquisition. One of the most important and extensive
steps in the entire ML process was \textbf{Step 2}. In this, the dataset
imported in Step 1 was explored using typical data analysis tools. In
addition to \textbf{exploring} the \textbf{data structure} as well as
the \textbf{inner correlations} in the dataset, \textbf{errors} such as
gaps, duplications, or obvious misentries had to be found and corrected
if possible. This was enormously important so that the classification
could later provide plausible results. Since the Iris dataset had no
gaps or duplications, an alternative dataset was used to
demonstrate the tools.

In \textbf{Step 3}, a very brief introduction to the world of artificial
intelligence and machine learning was given. The introduction was
supported by a \textbf{taxonomy of different types of learning} and the
listing of selected ML algorithms. A \textbf{decision graph} was used to
justify the choice of the Support Vector Classifier (SVC) for the
classification task at hand. Afterwards, the basic working principle of
the SVC including the so-called \textbf{kernel trick} was explained.
Finally, a corresponding SVC model was implemented.

In \textbf{Step 4} the dataset was preprocessed for the actual
classification by SVC. Depending on the selected ML algorithm as well as
the data structure, it could be necessary to prepare the data before
training, e.g., by \textbf{standardization} or \textbf{normalization}.
For the Iris dataset used, standardization was sufficient to align the
value ranges of the features.

After splitting the dataset into a \textbf{training and test dataset},
the SVC model was trained with the training dataset in \textbf{Step 5}.
Subsequently, \textbf{classification predictions} were made with the
trained SVC model using the test data.

In \textbf{Step 6}, the quality of the classification result was
evaluated using known \textbf{metrics} such as the
\textbf{cross-validation score} and the \textbf{confusion matrix}.

The classification in Step 5 was initially carried out with default
values for the so-called \textbf{hyper-parameters} of the SVC.
Therefore, in \textbf{Step 7} the meaning of the different parameters
was explained. Then, their influence on the classification result was
demonstrated by \textbf{manually varying} the individual
hyper-parameters.

In the final \textbf{Step 8}, two approaches to systematic
hyper-parameter search were presented: \textbf{grid search} and
\textbf{randomized search}. While the former considered exhaustively all
parameter combinations for given values, the latter approach selected a
number of candidates from a parameter space with a particular random
distribution. In order to be able to evaluate whether the two search
methods led to improved results, classification was first carried out
using default values for the hyper-parameters. The so-called
\textbf{baseline} was determined. It turned out that the Iris dataset
already gave results that were far too good at default values (recognition
rates around 100\%) to expect significant improvements from a systematic parameter
search. Therefore, \textbf{Gaussian noise} was added to the dataset in a
second step, which at the same time made it possible to simulate the
\textbf{measurement noise} of real ML applications.

Using a custom Python class, the \textbf{run times} for the two search
methods were determined. As expected, the grid search took considerably
longer than the randomized search. However, the randomized search only
ever identified local optima for the hyper-parameters, which also led to
different results for each run. The
quality of the classification in terms of recognition rate was again
determined using the metrics presented in Step 6. While the grid search
always produced better results compared to the baseline, these varied
for the randomized search and were each slightly better or worse than
those of the baseline.

As an outlook, the current tutorial could be extended by replacing the
Iris dataset with the much more modern
\textbf{\href{https://github.com/mcnakhaee/palmerpenguins}{Penguin
dataset}} (original package for \textbf{R}: \cite{palmerpenguins_R_2020}
as well as adapted package for \textbf{Python}:
\cite{palmerpenguins_Python_2020}). However, this would mean a
fundamental reworking of all the steps in this tutorial.

As explained above, the tutorial so far is limited to the introduction
of the ML algorithms, the tools for data analysis, and the Python
libraries and programming systems. In a further step, concrete hints
could be given on how to build one's own \textbf{real ML application} and
how to generate a suitable dataset for classification.

In the future, the \textbf{systematic process steps} of a typical ML
workflow explained in the tutorial could form the \textbf{foundation}
for possible \textbf{test guidelines}. These could be used to perform
\textbf{reviews of industrial ML applications} -- similar to the
evaluation of the functional safety of control software.

    \hypertarget{german-summary}{%
\subsection{German summary}\label{german-summary}}

Im November 2022 fand die \textbf{Fachtagung ``Künstliche Intelligenz''}
in Dresden statt, welche durch die Deutsche Gesetzliche
Unfallversicherung (DGUV) ausgerichtet wurde. Dort wurde im Rahmen eines
eigenen \textbf{Getting-Started-Workshops} das vorliegende Tutorial
interessierten ML-Neulingen im technischen Arbeitsschutz der
gesetzlichen Unfallversicherungsträger präsentiert.

Im \textbf{Tutorial} wurde systematisch und Schritt-für-Schritt der
\textbf{typische Arbeitsablauf} beim \textbf{maschinellen Lernen (ML)}
anhand des sehr bekannten \textbf{Iris-Datensatzes} demonstriert. Es
wurde deshalb auf einen fertigen Datensatz zurückgegriffen, damit sich
ein ML-Neuling zunächst mit den ML-Algorithmen, den Werkzeugen zur
Datenanalyse sowie den Software-Bibliotheken und Programmiersystemen
vertraut machen kann. Die Aufgabe bestand darin, drei verschiedene
Iris-Arten anhand der \textbf{Abmessungen} (Breite und Länge) ihrer
\textbf{Kron- und Kelchblätter} zu unterscheiden, d.~h. zu
klassifizieren. Der Datensatz enthält jeweils \textbf{50 vermessene
Individuen pro Art}.

Zur Klassifikation des Datensatzes kam der sehr leistungsfähige
\textbf{Support Vector Classifier (SVC)} zum Einsatz. Obwohl es eine
sehr reichhaltige Auswahl anderer leistungsfähiger und für die hier
vorliegende Klassifikationsaufgabe geeigneter ML-Algorithmen gibt, wurde
für einen verständlichen Einstieg bewusst der SVC-Algorithmus für die
Zielgruppe des Workshops gewählt. Dessen Arbeitsweise ist sowohl für
ML-Neulinge als auch in dem für den Workshop vorgegebenen Zeitrahmen
leicht vermittelbar.

Die Hauptabschnitte des Tutorials repräsentieren die einzelnen Schritte
in einem typischen ML-Arbeitsablauf. Im \textbf{Schritt 0} wurden
konkrete Hinweise für die Auswahl der für das maschinelle Lernen
geeigneten Hardware und Software gegeben. Nach einführenden Hinweisen
hinsichtlich des zu berücksichtigenden \textbf{Community Supports} wurde
bei den Ausführungen zur \textbf{Hardware-Auswahl} zwischen Trainings-
und Applikationssystem unterschieden. Hinsichtlich der
\textbf{Software-Auswahl} wurden die bei ML-Entwicklern beliebten
Programmiersprachen und Programmierumgebungen (sog. IDEs) vorgestellt
und jeweils Vor- und Nachteile genannt. Darüber hinaus wurden
Cloud-gehostete IDEs gezeigt, die die Anschaffung geeigneter Hardware
und die Installation der notwendigen Software-Pakete bis zu einem
gewissen Grad überflüssig machen.

Wie oben begründet, wurde im \textbf{Schritt 1} der fertige und sehr
einsteigerfreundliche \textbf{Iris-Datensatz} in Python importiert unter
Nennung der Bezugsquellen. Mit der wichtigste und umfangreichste Schritt
im gesamten ML-Prozess ist \textbf{Schritt 2}. In diesem wurde der in
Schritt 1 importierte Datensatz mit Hilfe typischer
Datenanalyse-Werkzeuge untersucht. Neben der \textbf{Erkundung} der
\textbf{Datenstruktur} sowie der \textbf{inneren Zusammenhänge (sog.
Korrelationen)} im Datensatz mussten auch \textbf{Fehler} wie z.~B.
Lücken, Dopplungen oder offensichtliche Fehleingaben gefunden und nach
Möglichkeit behoben werden. Dies war enorm wichtig, damit die
Klassifikation später plausible Ergebnisse liefern konnte. Da der
Iris-Datensatz keine Lücken und Dopplungen aufwies, wurde zur
Demonstration der Werkzeuge auf einen anderen alternativen Datensatz
ausgewichen.

Im \textbf{Schritt 3} wurde zunächst eine sehr kurze Einführung in die
Welt der künstlichen Intelligenz und des maschinellen Lernens gegeben.
Unterstützt wurde die Einführung durch eine \textbf{Taxonomie der
verschiedenen Lernarten} und die Nennung ausgewählter ML-Algorithmen.
Anhand eines \textbf{Entscheidungsgraphen} wurde die Wahl des Support
Vector Classifiers (SVC) für die vorliegende Klassifikationsaufgabe
begründet. Danach wurde das grundsätzliche Funktionsprinzip des SVC
einschließlich des sog. \textbf{Kernel-Tricks} erläutert. Abschließend
wurde ein entsprechendes SVC-Modell implementiert.

Im \textbf{Schritt 4} wurde der Datensatz für die eigentliche
Klassifikation per SVC vorbereitet. Je nach gewähltem ML-Algorithmus
sowie der Datenstruktur konnte es erforderlich sein, die Daten vor
dem Training aufzubereiten, z.~B. durch
\textbf{Standardisierung} oder \textbf{Normalisierung}. Für den
verwendeten Iris-Datensatz genügte eine Standardisierung, um die
Wertebereiche der Features aneinander anzugleichen.

Nach der Aufteilung des Datensatzes in einen \textbf{Trainings- und
Testdatensatz} wurde das SVC-Modell im \textbf{Schritt 5} mit dem
Trainingsdatensatz trainiert. Anschließend wurden mit dem trainierten
SVC-Modell anhand der Testdaten \textbf{Klassifikationsvorhersagen}
getroffen.

Im \textbf{Schritt 6} wurde die Güte des Klassifikationsergebnisses
unter Verwendung bekannter \textbf{Metriken} wie z.~B. dem
\textbf{Kreuzvalidierungskennwert (engl. Cross Validation Score)} und der
\textbf{Konfusionsmatrix} evaluiert.

Die Klassifikation im Schritt 5 wurde zunächst mit Standardwerten für
die sogenannten \textbf{Hyper-Parameter} des SVC durchgeführt. Daher
wurde im \textbf{Schritt 7} die Bedeutung der verschiedenen Parameter
erklärt. Danach wurde ihr Einfluss auf das Klassifikationsergebnis durch
\textbf{manuelle Variation} der einzelnen Hyper-Parameter demonstriert.

Im abschließenden \textbf{Schritt 8} wurden zwei Ansätze zur
systematischen Hyper-Parameter-Suche vorgestellt: \textbf{Rastersuche
(engl. Grid Search)} und \textbf{Zufallssuche (engl. Randomized Search)}.
Während bei ersterer für gegebene Werte alle Parameterkombinationen
erschöpfend betrachtet wurden, wurde beim zweiten Ansatz eine Anzahl von
Kandidaten aus einem Parameterraum mit einer bestimmten zufälligen
Verteilung ausgewählt. Um bewerten zu können, ob die beiden Suchmethoden
zu verbesserten Ergebnissen führten, wurde zuerst mit Standardwerten für
die Hyper-Parameter klassifiziert. Es wurde hierbei die sogenannte
\textbf{Basislinie} ermittelt. Dabei zeigte sich, dass der
Iris-Datensatz schon bei Standardwerten viel zu gute Ergebnisse lieferte
(Erkennungsraten um 100\%), um durch systematische Parameter-Suche
deutliche Verbesserungen zu erwarten. Daher wurde dem Datensatz in einem
zweiten Schritt \textbf{Gaußsches Rauschen} hinzugefügt, wodurch
gleichzeitig \textbf{Messrauschen} realer ML-Anwendungen simuliert
werden konnte.

Unter Verwendung einer eigenen Python-Klasse wurden die
\textbf{Durchlaufzeiten} für die beiden Suchmethoden ermittelt.
Erwartungsgemäß dauerte die Rastersuche deutlich länger als die
Zufallssuche. Allerdings wurden bei der
Zufallssuche stets nur lokale Optima für die Hyper-Parameter ermittelt,
die auch bei jedem Durchlauf zu anderen Ergebnissen führten. Die Güte
der Klassifikation hinsichtlich der Erkennungsrate wurde wieder mit
Hilfe der in Schritt 6 vorgestellten Metriken ermittelt. Während die
Rastersuche stets bessere Ergebnisse im Vergleich zur Basislinie
lieferte, schwankten diese bei der Zufallssuche und waren jeweils
geringfügig besser oder schlechter als die der Basislinie.

Ausblickend könnte das vorliegende Tutorial erweitert werden, indem der
Iris-Datensatz gegen den deutlich moderneren
\textbf{\href{https://github.com/mcnakhaee/palmerpenguins}{Pinguin-Datensatz}}
ausgetauscht wird (originales Paket für \textbf{R}:
\cite{palmerpenguins_R_2020} sowie adaptiertes Paket für
\textbf{Python}: \cite{palmerpenguins_Python_2020}). Dies würde
allerdings eine grundsätzliche Überarbeitung sämtlicher Schritte des
Tutorials bedeuten.

Wie oben erläutert, beschränkt sich das Tutorial bisher auf die
Vorstellung der ML-Algorithmen, der Werkzeuge zur Datenanalyse sowie der
Python-Bibliotheken und Programmiersysteme. In einem weiteren Schritt
könnten konkrete Hinweise gegeben werden, wie eine eigene \textbf{reale
ML-Applikation} aufgebaut und ein geeigneter Datensatz für eine
Klassifikation erzeugt werden kann.

Zukünftig könnten die im Tutorial erläuterten \textbf{systematischen
Prozessschritte} eines typischen ML-Workflows die \textbf{Grundlage} für
mögliche \textbf{Prüfgrundsätze} bilden. Mit diesen könnten
\textbf{Reviews industrieller ML-Applikationen} durchgeführt werden --
ähnlich wie bei der Bewertung der funktionalen Sicherheit von
Steuerungssoftware.

    \hypertarget{acknowledgments}{%
\section{Acknowledgments}\label{acknowledgments}}

Vor einem reichlichen Jahr wurde ich eingeladen, im Vorbereitungskomitee
für die DGUV-Fachtagung ``Künstliche Intelligenz'' mitwirken zu dürfen.
Mein Vorschlag, einen eigenen Getting-Started-Workshop für interessierte
ML-Neulinge zu gestalten, wurde dort sehr positiv aufgenommen. Das hat
mich für die Ausarbeitung des vorliegenden Tutorials sehr motiviert.

Mein besonderer Dank gilt Herrn Prof.~André Steimers, der mit langen und
sehr interessanten Fachgesprächen, dem Lesen von Rohfassungen und seiner
konstruktiven Kritik viel Zeit investierte.

Weiterhin danke ich meinen Kollegen des Dresdener Prüflabors dafür, dass
sie sich jederzeit trotz sehr hohem Prüfaufkommen Zeit für meine
themenbezogene Fachsimpelei genommen haben. Insbesondere konnte ich
während dieser Gespräche meine Gedankengänge und Formulierungen auf
Verständlichkeit und Nachvollziehbarkeit prüfen.

Abschließend möchte ich meiner Lebensgefährtin danken, dass sie erste
Textentwürfe kritisch Korrektur gelesen hat und mir ansonsten den Rücken
freigehalten hat -- auch wenn ich nach Feierabend oder an den Wochenenden
programmiert und geschrieben habe. Unserem zweijährigen Sohn danke ich
für seine Geduld mit Papa. Er hätte sicherlich das ein oder andere Mal
lieber ``Die Sendung mit der Maus'' statt seltsamer Grafiken mit mir auf
dem Rechner angeschaut.

Dresden, 11.11.2022


    % Add a bibliography block to the postdoc
    
    
    % Use bibliography
    \printbibheading[heading=bibnumbered]
    \printbibliography[heading=subbibliography,keyword={URL},title={Online references}]
    \printbibliography[heading=subbibliography,keyword={book},title={Books, technical reports and others}]
    %\printbibliography[heading=subbibliography,title={Others}]
    

\end{document}