Commit 3dd06d82 authored by Tediloma

paper ver 2.0

parent d055f588
...@@ -17,30 +17,26 @@ ...@@ -17,30 +17,26 @@
\providecommand\HyField@AuxAddToFields[1]{} \providecommand\HyField@AuxAddToFields[1]{}
\providecommand\HyField@AuxAddToCoFields[2]{} \providecommand\HyField@AuxAddToCoFields[2]{}
\citation{estevam2020zeroshot} \citation{estevam2020zeroshot}
\citation{cao2019openpose}
\citation{duan2021revisiting}
\citation{Liu_2020}
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{reimers2019sentencebert}
\@writefile{toc}{\contentsline {section}{\numberline {1}\hskip -1em.\nobreakspace {}Introduction}{1}{section.1}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {1}\hskip -1em.\nobreakspace {}Introduction}{1}{section.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}\hskip -1em.\nobreakspace {}Zero-shot learning}{1}{subsection.1.1}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {2}\hskip -1em.\nobreakspace {}Method}{1}{section.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}\hskip -1em.\nobreakspace {}Skeleton-based visual recognition}{1}{subsection.1.2}\protected@file@percent } \newlabel{method}{{2}{1}{\hskip -1em.~Method}{section.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3}\hskip -1em.\nobreakspace {}Related work}{1}{subsection.1.3}\protected@file@percent }
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{yan2018spatial} \citation{yan2018spatial}
\citation{reimers2019sentencebert} \citation{reimers2019sentencebert}
\citation{sung2018learning} \citation{sung2018learning}
\citation{cao2019openpose}
\citation{duan2021revisiting}
\citation{yan2018spatial} \citation{yan2018spatial}
\citation{reimers2019sentencebert}
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{Pagliardini_2018} \citation{Pagliardini_2018}
\citation{sung2018learning} \citation{sung2018learning}
\citation{sung2018learning} \citation{sung2018learning}
\citation{sung2018learning}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Overview of the network modules.}}{2}{figure.1}\protected@file@percent } \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Overview of the network modules.}}{2}{figure.1}\protected@file@percent }
\newlabel{architecture}{{1}{2}{Overview of the network modules}{figure.1}{}} \newlabel{architecture}{{1}{2}{Overview of the network modules}{figure.1}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.4}\hskip -1em.\nobreakspace {}Data augmentation}{2}{subsection.1.4}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {2}\hskip -1em.\nobreakspace {}Method}{2}{section.2}\protected@file@percent }
\newlabel{method}{{2}{2}{\hskip -1em.~Method}{section.2}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}\hskip -1em.\nobreakspace {}Architecture}{2}{subsection.2.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.1}\hskip -1em.\nobreakspace {}Architecture}{2}{subsection.2.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Visual path}{2}{subsubsection.2.1.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Visual path}{2}{subsubsection.2.1.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.2}Semantic Path}{2}{subsubsection.2.1.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.2}Semantic Path}{2}{subsubsection.2.1.2}\protected@file@percent }
...@@ -48,6 +44,7 @@ ...@@ -48,6 +44,7 @@
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}\hskip -1em.\nobreakspace {}Augmentation}{2}{subsection.2.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.2}\hskip -1em.\nobreakspace {}Augmentation}{2}{subsection.2.2}\protected@file@percent }
\citation{ma2019nlpaug} \citation{ma2019nlpaug}
\citation{liu2019roberta} \citation{liu2019roberta}
\citation{Liu_2020}
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{sung2018learning} \citation{sung2018learning}
\@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three descriptive labels for the class "Squat down".}}{3}{table.1}\protected@file@percent } \@writefile{lot}{\contentsline {table}{\numberline {1}{\ignorespaces Three descriptive labels for the class "Squat down".}}{3}{table.1}\protected@file@percent }
...@@ -59,21 +56,21 @@ ...@@ -59,21 +56,21 @@
\newlabel{tab:auto_aug}{{2}{3}{Descriptive label and two automatic augmentations for "Squat down"}{table.2}{}} \newlabel{tab:auto_aug}{{2}{3}{Descriptive label and two automatic augmentations for "Squat down"}{table.2}{}}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces ZSL and GZSL results for different approaches.}}{3}{table.3}\protected@file@percent } \@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces ZSL and GZSL results for different approaches.}}{3}{table.3}\protected@file@percent }
\newlabel{tab:ZSL_GZSL}{{3}{3}{ZSL and GZSL results for different approaches}{table.3}{}} \newlabel{tab:ZSL_GZSL}{{3}{3}{ZSL and GZSL results for different approaches}{table.3}{}}
\@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Unseen top-1 and top-5 accuracies in detail.}}{3}{table.4}\protected@file@percent }
\newlabel{tab:top1_top5}{{4}{3}{Unseen top-1 and top-5 accuracies in detail}{table.4}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}\hskip -1em.\nobreakspace {}Experiments}{3}{subsection.2.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {2.3}\hskip -1em.\nobreakspace {}Experiments}{3}{subsection.2.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{3}{section.3}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{3}{section.3}\protected@file@percent }
\citation{jasani2019skeleton} \citation{jasani2019skeleton}
\citation{ma2019nlpaug} \citation{ma2019nlpaug}
\bibstyle{ieee_fullname} \@writefile{lot}{\contentsline {table}{\numberline {4}{\ignorespaces Unseen top-1 and top-5 accuracies in detail.}}{4}{table.4}\protected@file@percent }
\bibdata{egbib} \newlabel{tab:top1_top5}{{4}{4}{Unseen top-1 and top-5 accuracies in detail}{table.4}{}}
\bibcite{cao2019openpose}{1}
\bibcite{duan2021revisiting}{2}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}\hskip -1em.\nobreakspace {}Discussion}{4}{subsection.3.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsection}{\numberline {3.1}\hskip -1em.\nobreakspace {}Discussion}{4}{subsection.3.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}From default to descriptive labels}{4}{subsubsection.3.1.1}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}From default to descriptive labels}{4}{subsubsection.3.1.1}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Using multiple labels}{4}{subsubsection.3.1.2}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Using multiple labels}{4}{subsubsection.3.1.2}\protected@file@percent }
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Automatic augmentation}{4}{subsubsection.3.1.3}\protected@file@percent } \@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Automatic augmentation}{4}{subsubsection.3.1.3}\protected@file@percent }
\@writefile{toc}{\contentsline {section}{\numberline {4}\hskip -1em.\nobreakspace {}Conclusion}{4}{section.4}\protected@file@percent } \@writefile{toc}{\contentsline {section}{\numberline {4}\hskip -1em.\nobreakspace {}Conclusion}{4}{section.4}\protected@file@percent }
\bibstyle{ieee_fullname}
\bibdata{egbib}
\bibcite{cao2019openpose}{1}
\bibcite{duan2021revisiting}{2}
\bibcite{estevam2020zeroshot}{3} \bibcite{estevam2020zeroshot}{3}
\bibcite{jasani2019skeleton}{4} \bibcite{jasani2019skeleton}{4}
\bibcite{Liu_2020}{5} \bibcite{Liu_2020}{5}
......
This is pdfTeX, Version 3.141592653-2.6-1.40.22 (MiKTeX 21.6) (preloaded format=pdflatex 2021.7.25) 26 JUL 2021 12:53 This is pdfTeX, Version 3.141592653-2.6-1.40.22 (MiKTeX 21.6) (preloaded format=pdflatex 2021.7.25) 28 JUL 2021 11:19
entering extended mode entering extended mode
**./paper_working_design.tex **./paper_working_design.tex
(paper_working_design.tex (paper_working_design.tex
...@@ -380,7 +380,12 @@ LaTeX Font Info: Trying to load font information for OT1+pcr on input line 5 ...@@ -380,7 +380,12 @@ LaTeX Font Info: Trying to load font information for OT1+pcr on input line 5
(K:\Programme\MiKTeX\tex/latex/psnfss\ot1pcr.fd (K:\Programme\MiKTeX\tex/latex/psnfss\ot1pcr.fd
File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr. File: ot1pcr.fd 2001/06/04 font definitions for OT1/pcr.
) )
Underfull \hbox (badness 10000) in paragraph at lines 72--75 Underfull \hbox (badness 1033) in paragraph at lines 69--72
\OT1/ptm/m/n/10 In [[]] a zero-shot clas-si-fi-ca-tion net-work for the NTU
[]
Underfull \hbox (badness 10000) in paragraph at lines 69--72
[] []
...@@ -388,12 +393,12 @@ Underfull \hbox (badness 10000) in paragraph at lines 72--75 ...@@ -388,12 +393,12 @@ Underfull \hbox (badness 10000) in paragraph at lines 72--75
] ]
<Architektur2.png, id=28, 885.6839pt x 440.77171pt> <Architektur2.png, id=17, 885.6839pt x 440.77171pt>
File: Architektur2.png Graphic file (type png) File: Architektur2.png Graphic file (type png)
<use Architektur2.png> <use Architektur2.png>
Package pdftex.def Info: Architektur2.png used on input line 88. Package pdftex.def Info: Architektur2.png used on input line 76.
(pdftex.def) Requested size: 237.13594pt x 118.01456pt. (pdftex.def) Requested size: 237.13594pt x 118.01456pt.
[2 <./Architektur2.png>] [3] (paper_working_design.bbl [4] [2 <./Architektur2.png>] [3] [4] (paper_working_design.bbl
Underfull \hbox (badness 10000) in paragraph at lines 40--43 Underfull \hbox (badness 10000) in paragraph at lines 40--43
[]\OT1/ptm/m/n/9 Edward Ma. Nlp aug-men-ta-tion. []\OT1/ptm/m/n/9 Edward Ma. Nlp aug-men-ta-tion.
[] []
...@@ -422,13 +427,13 @@ Underfull \hbox (badness 2941) in paragraph at lines 58--62 ...@@ -422,13 +427,13 @@ Underfull \hbox (badness 2941) in paragraph at lines 58--62
] (paper_working_design.aux) ) ] (paper_working_design.aux) )
Here is how much of TeX's memory you used: Here is how much of TeX's memory you used:
9392 strings out of 478864 9388 strings out of 478864
135386 string characters out of 2860441 135330 string characters out of 2860441
460419 words of memory out of 3000000 459419 words of memory out of 3000000
27120 multiletter control sequences out of 15000+600000 27120 multiletter control sequences out of 15000+600000
428885 words of font info for 82 fonts, out of 8000000 for 9000 428885 words of font info for 82 fonts, out of 8000000 for 9000
1141 hyphenation exceptions out of 8191 1141 hyphenation exceptions out of 8191
72i,13n,80p,1277b,490s stack positions out of 5000i,500n,10000p,200000b,80000s 72i,13n,80p,1277b,362s stack positions out of 5000i,500n,10000p,200000b,80000s
{K:/Programme/MiKTeX/fonts/enc/dvips/base/8r.enc} {K:/Programme/MiKTeX/fonts/enc/dvips/base/8r.enc}
<K:/Programme/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi10.pfb><K:/Programme/Mi <K:/Programme/MiKTeX/fonts/type1/public/amsfonts/cm/cmmi10.pfb><K:/Programme/Mi
KTeX/fonts/type1/public/amsfonts/cm/cmr10.pfb><K:/Programme/MiKTeX/fonts/type1/ KTeX/fonts/type1/public/amsfonts/cm/cmr10.pfb><K:/Programme/MiKTeX/fonts/type1/
...@@ -436,9 +441,9 @@ public/amsfonts/cm/cmsy10.pfb><K:/Programme/MiKTeX/fonts/type1/urw/courier/ucrr ...@@ -436,9 +441,9 @@ public/amsfonts/cm/cmsy10.pfb><K:/Programme/MiKTeX/fonts/type1/urw/courier/ucrr
8a.pfb><K:/Programme/MiKTeX/fonts/type1/urw/times/utmb8a.pfb><K:/Programme/MiKT 8a.pfb><K:/Programme/MiKTeX/fonts/type1/urw/times/utmb8a.pfb><K:/Programme/MiKT
eX/fonts/type1/urw/times/utmr8a.pfb><K:/Programme/MiKTeX/fonts/type1/urw/times/ eX/fonts/type1/urw/times/utmr8a.pfb><K:/Programme/MiKTeX/fonts/type1/urw/times/
utmri8a.pfb> utmri8a.pfb>
Output written on paper_working_design.pdf (5 pages, 252768 bytes). Output written on paper_working_design.pdf (5 pages, 253621 bytes).
PDF statistics: PDF statistics:
142 PDF objects out of 1000 (max. 8388607) 138 PDF objects out of 1000 (max. 8388607)
44 named destinations out of 1000 (max. 500000) 40 named destinations out of 1000 (max. 500000)
6 words of extra memory for PDF output out of 10000 (max. 10000000) 6 words of extra memory for PDF output out of 10000 (max. 10000000)
...@@ -50,37 +50,25 @@ Karlsruhe Institute of Technology\\ ...@@ -50,37 +50,25 @@ Karlsruhe Institute of Technology\\
%%%%%%%%% ABSTRACT %%%%%%%%% ABSTRACT
\begin{abstract} \begin{abstract}
One of the big challenges of zero-shot learning is translating the semantic information given about an untrained class into an expectation of what visual features a sample of that class would have. This is done by creating semantic embeddings of the class labels with a text embedding network trained on a large text corpus. However, that semantic embedding will be more focused on the semantic meaning of the label rather than its visual characteristics. In this work, we present different forms of data augmentation that can be applied to the semantic embeddings of the class labels to increase their visual information content. This approach achieves a significant performance improvement for a zero-shot gesture recognition model. Interaction with computer systems is one of the most important topics of the digital age. Recent advances in gesture recognition show that controlling a system only by moving parts of one's own body can provide advantages over physical interaction. To perform this task, the system needs to reliably detect the performed gestures. For current systems using deep learning methods, this means they have to be trained on all possible gestures beforehand. This is where zero-shot learning comes in, which makes it possible to also recognize gestures not seen during training.
Here, one of the big challenges is to translate the semantic information given about an unseen class into an expectation of what visual features a sample of that class would have. With typical semantic embeddings such as BERT, that semantic information is focused more on the semantic meaning of the label than on its visual characteristics. In this work, we present different forms of data augmentation that can be applied to the semantic embeddings of the class labels to increase their visual information content. This approach achieves a significant performance improvement for a zero-shot gesture recognition model.
\end{abstract} \end{abstract}
%%%%%%%%% BODY TEXT %%%%%%%%% BODY TEXT
\section{Introduction} \section{Introduction}
Gesture recognition in videos is a rapidly growing field of research and could become an important component for input-device-less control of consumer products such as drones or televisions. While various past works have focused on the classification of gestures known in advance, this work deals with gesture recognition using the zero-shot learning approach. Such a task is interesting because the zero-shot approach not only allows the use of learned gestures for fixed commands; additionally, it is possible to incorporate untrained gestures. The user of the product is thus offered the opportunity to expand the command set for controlling the device. Gesture recognition in videos is a rapidly growing field of research and could become an important component for input-device-less control of consumer products such as drones or televisions. While various past works have focused on the classification of gestures known in advance, this work deals with gesture recognition using the zero-shot learning approach. Such a task is interesting because the zero-shot approach not only allows the use of learned gestures for fixed commands; additionally, it is possible to incorporate untrained gestures. The user of the product is thus offered the opportunity to expand the command set for controlling the device.\\
\indent In order to be able to classify samples of untrained (also called "unseen") classes, a network needs to have an expectation of what the gesture corresponding to that class’s label might look like. This is usually done through text embeddings \cite{estevam2020zeroshot}: Trained on unannotated text data, language embedding models extract meaning from words or sentences by converting them into a semantic embedding vector. After creating a semantic embedding for each class label, it is possible to compare the embeddings of unseen classes with those of seen classes to determine what characteristics those classes share. If those similarities are also present in the visual input samples, the network can deduce that the input sample belongs to that specific unseen class.\\
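As a purely illustrative sketch of this comparison (and not the learned similarity module used later in this work), the label embeddings of seen and unseen classes can be compared with a simple cosine similarity; the model name and the example labels below are assumptions:
\begin{verbatim}
# Illustrative only: compare class-label embeddings of unseen and seen classes.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("all-MiniLM-L6-v2")     # any pretrained sentence embedder
seen_labels = ["drink water", "throw", "sit down"]
unseen_labels = ["squat down"]

seen_emb = model.encode(seen_labels, convert_to_tensor=True)
unseen_emb = model.encode(unseen_labels, convert_to_tensor=True)

# High cosine similarity indicates which seen classes an unseen class resembles.
print(util.cos_sim(unseen_emb, seen_emb))
\end{verbatim}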
%------------------------------------------------------------------------- \indent It is quite common to apply data augmentation techniques such as cropping, scaling or flipping to the video input of a network in order to increase the amount of available training samples. However, in zero-shot learning there are two different, equally important kinds of training information for each class: visual and semantic. These common data augmentation strategies make it possible to multiply the amount of visual training data, but the semantic information remains minimal, usually restricted to the simple label of the class.
\subsection{Zero-shot learning}
In order to be able to classify samples of untrained (also called "unseen") classes, a network needs to have an expectation of what the gesture corresponding to that class’s label might look like. This is usually done through text embeddings \cite{estevam2020zeroshot}: Trained on unannotated text data, language embedding models extract meaning from words or sentences by converting them into a semantic embedding vector. After creating a semantic embedding for each class label, it is possible to compare the embeddings of unseen classes with those of seen classes to determine what characteristics those classes share. If those similarities are also present in the visual input samples, the network can deduce that the input sample belongs to that specific unseen class.
\subsection{Skeleton-based visual recognition}
RGB videos contain a lot of information, which is not necessary to recognize the performed gesture, such as the background or a person’s clothing. To reduce the amount of unnecessary detail we use a temporal series of skeletons as input data. Each skeleton is a graph whose nodes represent the person’s joints. A full input sample consists of a series of one skeleton graph per frame. Such skeleton data can be obtained from RGB video using a framework like openpose \cite{cao2019openpose}. Since gestures are fully defined by the motion of a person’s limbs, it is possible for an appropriate network to recognize them based on an input of this form \cite{duan2021revisiting}. In this work we use the NTU RGB+D 120 dataset \cite{Liu_2020}, which contains 3D skeleton data for 114,480 samples of 120 different human action classes.
%-------------------------------------------------------------------------
\subsection{Related work}
In \cite{jasani2019skeleton} a zero-shot classification network for the NTU dataset was created. Their architecture features a multilayer perceptron (MLP) to map the semantic embeddings of the class labels into the visual features space and another MLP that learns a deep similarity metric between those semantic features and the visual features of a given input sample.\\
\\
Sentence BERT (SBERT) \cite{reimers2019sentencebert} is a text embedding module that takes a sentence as input, analyzes it and gives two kinds of outputs: a cls-token vector, that is a representation of the entire sentence, and a series of embedding vectors, that each represent one word of the input sentence with its context. A mean token vector can be created out of this secondary output by applying an attention mask to the series of tokens to combine them into a single one. This way two separate semantic embeddings can be generated for each input sentence: a cls-token and a mean-token.
\subsection{Data augmentation}
It is quite common to apply data augmentation techniques such as cropping, scaling or flipping to the video input of a network in order to increase the amount of available training samples. However, in zero-shot learning there are two different, equally important kinds of training information for each class: visual and semantic. These common data augmentation strategies make it possible to multiply the amount of visual training data, but the semantic information remains minimal, usually restricted to the simple label of the class.
We aim to provide the network with more relevant semantic information about the different classes by applying several forms of data augmentation to the semantic embeddings of the class labels. We aim to provide the network with more relevant semantic information about the different classes by applying several forms of data augmentation to the semantic embeddings of the class labels.
\section{Method} \section{Method}
\label{method} \label{method}
First, we build a network capable of zero-shot learning for gesture recognition and assess its baseline performance. Then we apply different forms of data augmentation to the semantic embeddings of the class labels and compare the classification accuracy. For our tests we use 40 gesture classes from the NTU RGB+D 120 dataset. First, we build a network capable of zero-shot learning for gesture recognition and assess its baseline performance. Then we apply different forms of data augmentation to the semantic embeddings of the class labels and compare the classification accuracy.
\\
In \cite{jasani2019skeleton} a zero-shot classification network for the NTU dataset was created. Their architecture features a multilayer perceptron (MLP) to map the semantic embeddings of the class labels into the visual features space and another MLP that learns a deep similarity metric between those semantic features and the visual features of a given input sample.\\
\begin{figure}[t] \begin{figure}[t]
\begin{center} \begin{center}
...@@ -96,13 +84,22 @@ The architecture chosen for our experiments largely corresponds to the model pre ...@@ -96,13 +84,22 @@ The architecture chosen for our experiments largely corresponds to the model pre
The architecture consists of three parts described in the following sections. The architecture consists of three parts described in the following sections.
\subsubsection{Visual path} \subsubsection{Visual path}
The task of the visual path is the feature extraction of a video sample (in form of a temporal series of 3D-Skeletons). The Graph Convolutional Net (GCN) from \cite{yan2018spatial} is used as a feature extractor, which we train exclusively on the 80 unused classes of the NTU RGB+D 120 dataset. This ensures that the unseen gestures have not already appeared at some early point in the training process before inference.
RGB videos contain a lot of information that is not necessary for recognizing the performed gesture, such as the background or a person’s clothing. To reduce the amount of unnecessary detail we use a temporal series of skeletons as input data. Each skeleton is a graph whose nodes represent the person’s joints. A full input sample consists of a series of one skeleton graph per frame. Such skeleton data can be obtained from RGB video using a framework like OpenPose \cite{cao2019openpose}. Since gestures are fully defined by the motion of a person’s limbs, it is possible for an appropriate network to recognize them based on an input of this form \cite{duan2021revisiting}.
The task of the visual path is the feature extraction of a video sample (in the form of a temporal series of 3D skeletons). The Graph Convolutional Net (GCN) from \cite{yan2018spatial} is used as a feature extractor. Its output is a 1x256 visual feature vector.
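A minimal PyTorch sketch of such a skeleton feature extractor is given below; only the 256-dimensional output is fixed by our setup, while the layer sizes, the (N, C, T, V) input layout and the pooling are simplifying assumptions and not the exact GCN of \cite{yan2018spatial}:
\begin{verbatim}
import torch
import torch.nn as nn

class SpatialGraphConv(nn.Module):
    # One simplified spatial graph convolution over the skeleton joints.
    def __init__(self, in_channels, out_channels, A):
        super().__init__()
        self.register_buffer("A", A)                  # (V, V) normalized joint adjacency
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):                             # x: (N, C, T, V)
        x = torch.einsum("nctv,vw->nctw", x, self.A)  # aggregate neighboring joints
        return self.conv(x)

class VisualPath(nn.Module):
    # Sketch of the feature extractor: stacked graph convs + global pooling to 256-d.
    def __init__(self, A, in_channels=3, feat_dim=256):
        super().__init__()
        self.gcn1 = SpatialGraphConv(in_channels, 64, A)
        self.gcn2 = SpatialGraphConv(64, feat_dim, A)

    def forward(self, x):                             # x: (N, 3, T, V) joint coordinates
        x = torch.relu(self.gcn1(x))
        x = torch.relu(self.gcn2(x))
        return x.mean(dim=(2, 3))                     # (N, 256) visual feature vector
\end{verbatim}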
\subsubsection{Semantic Path} \subsubsection{Semantic Path}
Sentence BERT (SBERT) \cite{reimers2019sentencebert} is a text embedding module that takes a sentence as input, analyzes it and gives two kinds of outputs: a cls-token vector, which is a representation of the entire sentence, and a series of embedding vectors that each represent one word of the input sentence with its context. A mean token vector can be created out of this secondary output by applying an attention mask to the series of tokens and combining them into a single vector. This way, two separate semantic embeddings can be generated for each input sentence: a cls-token and a mean-token.
\\
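Both embeddings can be obtained, for example, with the Hugging Face transformers API as sketched below; the concrete checkpoint name is an assumption:
\begin{verbatim}
import torch
from transformers import AutoTokenizer, AutoModel

name = "sentence-transformers/bert-base-nli-mean-tokens"   # assumed SBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained(name)
sbert = AutoModel.from_pretrained(name)

def embed(labels):
    enc = tokenizer(labels, padding=True, return_tensors="pt")
    with torch.no_grad():
        out = sbert(**enc).last_hidden_state           # (B, L, 768), one vector per token
    cls_token = out[:, 0]                              # sentence-level cls embedding
    mask = enc["attention_mask"].unsqueeze(-1).float() # ignore padding tokens
    mean_token = (out * mask).sum(1) / mask.sum(1)     # masked mean over word tokens
    return cls_token, mean_token
\end{verbatim}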
The semantic path consists of two modules. The first is an SBERT module that transforms the vocabulary, i.e. all possible class labels, into semantic embeddings. This differs from the original architecture in \cite{jasani2019skeleton}, where a Sent2Vec module \cite{Pagliardini_2018} is used. We use the mean-token output of the SBERT module rather than the cls-token as our semantic embedding because it resulted in better performance. The attribute network (AN) then transforms the semantic embeddings into semantic features by mapping them into the visual feature space. The AN is introduced in \cite{sung2018learning}, where it contributes a significant part to the solution of the ZSL task along with the Relation Net (RN), which is explained in more detail in the following section. We apply dropout with a factor of 0.5 to the first layer of the AN. The semantic path consists of two modules. The first is an SBERT module that transforms the vocabulary, i.e. all possible class labels, into semantic embeddings. This differs from the original architecture in \cite{jasani2019skeleton}, where a Sent2Vec module \cite{Pagliardini_2018} is used. We use the mean-token output of the SBERT module rather than the cls-token as our semantic embedding because it resulted in better performance. The attribute network (AN) then transforms the semantic embeddings into semantic features by mapping them into the visual feature space. The AN is introduced in \cite{sung2018learning}, where it contributes a significant part to the solution of the ZSL task along with the Relation Net (RN), which is explained in more detail in the following section. We apply dropout with a factor of 0.5 to the first layer of the AN.
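A possible implementation of the AN is sketched below; only the dropout factor of 0.5 on the first layer is fixed in the text, so the hidden size and the number of layers are assumptions:
\begin{verbatim}
import torch.nn as nn

class AttributeNet(nn.Module):
    # Maps a 768-d semantic embedding into the 256-d visual feature space.
    def __init__(self, sem_dim=768, hid_dim=512, vis_dim=256, p=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(sem_dim, hid_dim), nn.ReLU(), nn.Dropout(p),  # dropout on first layer
            nn.Linear(hid_dim, vis_dim), nn.ReLU(),
        )

    def forward(self, s):                  # s: (K, 768) embeddings of all class labels
        return self.net(s)                 # (K, 256) semantic features
\end{verbatim}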
\subsubsection{Similarity-Learning-Part} \subsubsection{Similarity-Learning-Part}
Here we first form the relation pairs by pairwise concatenating the visual features of our sample with the semantic features of each class. These relation pairs are then fed into the relation network (RN) introduced in \cite{sung2018learning}, which is another MLP. We add an additional linear layer and apply dropout to the first and second layer with a factor of 0.5. The RN learns a deep similarity metric in order to assess the similarity of the semantic and visual features within each relation pair. This way, it computes a similarity score for each pair, which symbolizes the input sample's similarity to each possible class. Here we first form the relation pairs by pairwise concatenating the visual features of our sample with the semantic features of each class. These relation pairs are then fed into the relation network (RN) introduced in \cite{sung2018learning}, which is another MLP. We add an additional linear layer and apply dropout to the first and second layer with a factor of 0.5. The RN applies a similarity metric in order to assess the similarity of the semantic and visual features within each relation pair. In contrast to previous work, we do not use a fixed similarity metric. Instead, the RN learns a deep similarity metric during training, which was introduced and shown to improve performance in \cite{sung2018learning}.
This way, it computes a similarity score for each pair, which represents the input sample's similarity to each possible class. The loss is calculated by comparing the similarity scores with a one-hot representation of the ground truth using mean squared error (MSE).
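The pair formation, the RN and the MSE loss against the one-hot ground truth can be sketched as follows; the hidden sizes are assumptions, while the dropout of 0.5 on the first two layers and the additional linear layer follow the description above:
\begin{verbatim}
import torch
import torch.nn as nn
import torch.nn.functional as F

class RelationNet(nn.Module):
    # Scores the compatibility of each (visual, semantic) relation pair.
    def __init__(self, feat_dim=256, hid_dim=256, p=0.5):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(2 * feat_dim, hid_dim), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hid_dim, hid_dim), nn.ReLU(), nn.Dropout(p),
            nn.Linear(hid_dim, 1), nn.Sigmoid(),      # similarity score in [0, 1]
        )

    def forward(self, vis, sem):
        # vis: (N, 256) sample features, sem: (K, 256) semantic features of all labels
        N, K = vis.size(0), sem.size(0)
        pairs = torch.cat([vis.unsqueeze(1).expand(N, K, -1),
                           sem.unsqueeze(0).expand(N, K, -1)], dim=-1)
        return self.net(pairs).squeeze(-1)            # (N, K) similarity scores

def zsl_loss(scores, target_idx, num_labels):
    # MSE between the scores and a one-hot encoding of the ground-truth label.
    one_hot = F.one_hot(target_idx, num_labels).float()
    return F.mse_loss(scores, one_hot)
\end{verbatim}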
\subsection{Augmentation} \subsection{Augmentation}
...@@ -114,7 +111,7 @@ In a first step, we increase the information content by replacing the class labe ...@@ -114,7 +111,7 @@ In a first step, we increase the information content by replacing the class labe
\subsubsection{Multiple labels per class} \subsubsection{Multiple labels per class}
We now increase the information content of the semantic embeddings even further by labeling each gesture with several different descriptions. Thus, we manually created two additional descriptions for each gesture using different wording. Consequently, each class now has three descriptive labels. An example label set is shown in table \ref{tab:multi_label}. In each iteration of the training process, the ground truth of a training video sample is randomly selected from one of the three possible labels. During inference, all three possibilities are considered correct if the network predicts one of them for the corresponding sample. We now increase the information content of the semantic embeddings even further by labeling each gesture with several different descriptions. Thus, we manually create two additional descriptions for each gesture using different wording. Consequently, each class now has three descriptive labels. An example label set is shown in table \ref{tab:multi_label}. The network computes a similarity score for each possible label, meaning that due to the increased vocabulary there are now three times as many similarity scores. In each iteration of the training process, the ground truth of a training video sample is randomly selected from one of the three possible labels. During inference, all three possibilities are considered correct if the network predicts one of them for the corresponding sample.
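A sketch of this label handling is given below; the layout of labels_per_class is our assumption. During training one of the three descriptions is drawn at random, and at inference the predicted label index is mapped back to its class:
\begin{verbatim}
import random
import torch

# labels_per_class[c] = list of the 3 vocabulary indices describing class c (assumed layout)
def sample_target(class_idx, labels_per_class):
    # training: pick one of the three descriptive labels as the ground truth
    return random.choice(labels_per_class[class_idx])

def predict_class(scores, labels_per_class):
    # inference: any of the three descriptions of the correct class counts as correct,
    # because the top-scoring label is mapped back to the class it describes
    label_to_class = {l: c for c, labels in enumerate(labels_per_class) for l in labels}
    return torch.tensor([label_to_class[int(i)] for i in scores.argmax(dim=1)])
\end{verbatim}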
\begin{table} \begin{table}
\begin{center} \begin{center}
...@@ -159,6 +156,10 @@ An example label set is shown in table \ref{tab:auto_aug}. One can see, that the ...@@ -159,6 +156,10 @@ An example label set is shown in table \ref{tab:auto_aug}. One can see, that the
\subsection{Experiments} \subsection{Experiments}
In this work we use the NTU RGB+D 120 dataset \cite{Liu_2020}, which contains 3D skeleton data for 114,480 samples of 120 different human action classes. For our tests we use 40 of these gesture classes. The GCN is trained exclusively on the 80 remaining classes, which ensures that the unseen gestures have not already appeared at some early point in the training process before inference.
In order to evaluate an augmentation method, we perform training runs on eight random 35/5 (seen/unseen) splits, chosen such that every class is unseen in exactly one training run. During training, only the weights of the AN and RN modules are adjusted. All other modules remain unchanged after their individual training. After testing, the accuracies are averaged over the eight individual experiments. For each augmentation method we test the performance in two scenarios: In the ZSL scenario, the model only predicts on the unseen classes, while it predicts on all classes (seen and unseen) in the GZSL scenario. In the latter we measure the accuracy for seen and unseen samples, as well as the harmonic mean, following recent works \cite{jasani2019skeleton}. For default and descriptive labels, we train our network with a batch size of 32 and without batch norm, as was done in the original paper \cite{sung2018learning}. For the multi-labels, however, we use a batch size of 128 and batch norm at the input of the RN. This was done mainly for performance reasons, because the multi-label approach with more than three labels did not learn at all without batch norm. %batchnorm in general -> decrease in unseen In order to evaluate an augmentation method, we perform training runs on eight random 35/5 (seen/unseen) splits, chosen such that every class is unseen in exactly one training run. During training, only the weights of the AN and RN modules are adjusted. All other modules remain unchanged after their individual training. After testing, the accuracies are averaged over the eight individual experiments. For each augmentation method we test the performance in two scenarios: In the ZSL scenario, the model only predicts on the unseen classes, while it predicts on all classes (seen and unseen) in the GZSL scenario. In the latter we measure the accuracy for seen and unseen samples, as well as the harmonic mean, following recent works \cite{jasani2019skeleton}. For default and descriptive labels, we train our network with a batch size of 32 and without batch norm, as was done in the original paper \cite{sung2018learning}. For the multi-labels, however, we use a batch size of 128 and batch norm at the input of the RN. This was done mainly for performance reasons, because the multi-label approach with more than three labels did not learn at all without batch norm. %batchnorm in general -> decrease in unseen
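For reference, the split generation and the GZSL harmonic mean can be computed as sketched below; apart from the constraint that every class is unseen exactly once, the concrete split procedure is our assumption:
\begin{verbatim}
import numpy as np

def make_splits(num_classes=40, unseen_per_split=5, seed=0):
    # eight 35/5 splits such that every class is unseen in exactly one split
    rng = np.random.default_rng(seed)
    classes = rng.permutation(num_classes)
    return [classes[i:i + unseen_per_split]
            for i in range(0, num_classes, unseen_per_split)]

def harmonic_mean(acc_seen, acc_unseen):
    # GZSL summary metric reported alongside the separate seen/unseen accuracies
    return 2 * acc_seen * acc_unseen / (acc_seen + acc_unseen)
\end{verbatim}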
......