Commit 906df2aa authored by uoega

added citation and ref to results chapter

parent d5f49efe
......@@ -20,11 +20,36 @@
title={Spatial Temporal Graph Convolutional Networks for Skeleton-Based Action Recognition},
author={Sijie Yan and Yuanjun Xiong and Dahua Lin},
year={2018},
journal={1801.07455},
journal={arXiv:1801.07455},
archivePrefix={arXiv},
primaryClass={cs.CV}
}
@misc{ma2019nlpaug,
title={NLP Augmentation},
author={Edward Ma},
howpublished={https://github.com/makcedward/nlpaug},
year={2019}
}
@article{liu2019roberta,
title={RoBERTa: A Robustly Optimized BERT Pretraining Approach},
author={Yinhan Liu and Myle Ott and Naman Goyal and Jingfei Du and Mandar Joshi and Danqi Chen and Omer Levy and Mike Lewis and Luke Zettlemoyer and Veselin Stoyanov},
year={2019},
journal={arXiv:1907.11692},
archivePrefix={arXiv}
}
@article{reimers2019sentencebert,
title={Sentence-BERT: Sentence Embeddings using Siamese BERT-Networks},
author={Nils Reimers and Iryna Gurevych},
year={2019},
journal={arXiv:1908.10084},
archivePrefix={arXiv},
primaryClass={cs.CL}
}
@misc{Authors14,
author = {Authors},
title = {The frobnicatable foo filter},
......
......@@ -20,9 +20,13 @@
\@writefile{toc}{\contentsline {subsection}{\numberline {1.1}\hskip -1em.\nobreakspace {}Zero-shot learning}{1}{subsection.1.1}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.2}\hskip -1em.\nobreakspace {}Skeleton-based visual recognition}{1}{subsection.1.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {1.3}\hskip -1em.\nobreakspace {}Data augmentation}{1}{subsection.1.3}}
\citation{liu2019roberta}
\citation{ma2019nlpaug}
\@writefile{toc}{\contentsline {section}{\numberline {2}\hskip -1em.\nobreakspace {}Method}{2}{section.2}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}\hskip -1em.\nobreakspace {}Augmentations}{2}{subsection.2.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {2.1.1}Automatic Augmentation}{2}{subsubsection.2.1.1}}
\citation{jasani2019skeleton}
\citation{sung2018learning}
\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Architecture or other needed for method}}{3}{figure.1}}
\newlabel{fig:long}{{1}{3}{Architecture or other needed for method}{figure.1}{}}
\newlabel{fig:onecol}{{1}{3}{Architecture or other needed for method}{figure.1}{}}
......@@ -34,15 +38,21 @@
\newlabel{fig:long}{{3}{3}{aug example2}{figure.3}{}}
\newlabel{fig:onecol}{{3}{3}{aug example2}{figure.3}{}}
\@writefile{lot}{\contentsline {table}{\numberline {2}{\ignorespaces ZSL and GZSL results for different approaches.}}{3}{table.2}}
\newlabel{tab:ZSL_GZSL}{{2}{3}{ZSL and GZSL results for different approaches}{table.2}{}}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Unseen top-1 and top-5 accuracies results in detail.}}{3}{table.3}}
\newlabel{tab:top1_top5}{{3}{3}{Unseen top-1 and top-5 accuracies results in detail}{table.3}{}}
\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}\hskip -1em.\nobreakspace {}Experiments}{3}{subsection.2.2}}
\@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{3}{section.3}}
\citation{jasani2019skeleton}
\citation{ma2019nlpaug}
\bibstyle{ieee_fullname}
\bibdata{egbib}
\bibcite{Alpher02}{1}
\bibcite{Alpher03}{2}
\bibcite{Alpher04}{3}
\bibcite{Authors14}{4}
\bibcite{Authors14b}{5}
\@writefile{lot}{\contentsline {table}{\numberline {3}{\ignorespaces Unseen top-1 and top-5 accuracies results in detail.}}{4}{table.3}}
\bibcite{jasani2019skeleton}{1}
\@writefile{toc}{\contentsline {section}{\numberline {3}\hskip -1em.\nobreakspace {}Results}{4}{section.3}}
\@writefile{toc}{\contentsline {subsection}{\numberline {3.1}\hskip -1em.\nobreakspace {}Discussion}{4}{subsection.3.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.1}From default to descriptive labels}{4}{subsubsection.3.1.1}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.2}Using multiple labels}{4}{subsubsection.3.1.2}}
\@writefile{toc}{\contentsline {subsubsection}{\numberline {3.1.3}Automatic augmentation}{4}{subsubsection.3.1.3}}
\@writefile{toc}{\contentsline {section}{\numberline {4}\hskip -1em.\nobreakspace {}Conclusion}{4}{section.4}}
\bibcite{liu2019roberta}{2}
\bibcite{ma2019nlpaug}{3}
\bibcite{sung2018learning}{4}
\begin{thebibliography}{1}\itemsep=-1pt
\bibitem{Alpher02}
FirstName Alpher.
\newblock Frobnication.
\newblock {\em Journal of Foo}, 12(1):234--778, 2002.
\bibitem{jasani2019skeleton}
Bhavan Jasani and Afshaan Mazagonwalla.
\newblock Skeleton based zero shot action recognition in joint pose-language
semantic space.
\newblock {\em arXiv:1911.11344}, 2019.
\bibitem{Alpher03}
FirstName Alpher and FirstName Fotheringham-Smythe.
\newblock Frobnication revisited.
\newblock {\em Journal of Foo}, 13(1):234--778, 2003.
\bibitem{liu2019roberta}
Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
Levy, Mike Lewis, Luke Zettlemoyer, and Veselin Stoyanov.
\newblock Roberta: A robustly optimized bert pretraining approach.
\newblock {\em arXiv:1907.11692}, 2019.
\bibitem{Alpher04}
FirstName Alpher, FirstName Fotheringham-Smythe, and FirstName Gamow.
\newblock Can a machine frobnicate?
\newblock {\em Journal of Foo}, 14(1):234--778, 2004.
\bibitem{ma2019nlpaug}
Edward Ma.
\newblock Nlp augmentation.
\newblock https://github.com/makcedward/nlpaug, 2019.
\bibitem{Authors14}
Authors.
\newblock The frobnicatable foo filter, 2014.
\newblock Face and Gesture submission ID 324. Supplied as additional material
{\tt fg324.pdf}.
\bibitem{Authors14b}
Authors.
\newblock Frobnication tutorial, 2014.
\newblock Supplied as additional material {\tt tr.pdf}.
\bibitem{sung2018learning}
Flood Sung, Yongxin Yang, Li Zhang, Tao Xiang, Philip H.~S. Torr, and
Timothy~M. Hospedales.
\newblock Learning to compare: Relation network for few-shot learning.
\newblock {\em arXiv:1711.06025}, 2018.
\end{thebibliography}
This is pdfTeX, Version 3.14159265-2.6-1.40.19 (MiKTeX 2.9.6840 64-bit) (preloaded format=pdflatex 2018.10.16) 24 JUL 2021 08:53
This is pdfTeX, Version 3.14159265-2.6-1.40.19 (MiKTeX 2.9.6840 64-bit) (preloaded format=pdflatex 2018.10.16) 24 JUL 2021 22:08
entering extended mode
**./paper_working_design.tex
(paper_working_design.tex
......@@ -444,6 +444,21 @@ File: Architektur.png Graphic file (type png)
Package pdftex.def Info: Architektur.png used on input line 104.
(pdftex.def) Requested size: 189.70947pt x 97.11714pt.
Underfull \hbox (badness 4181) in paragraph at lines 113--114
\OT1/ptm/m/n/10 To re-duce the man-ual an-no-ta-tion ef-fort, we would
[]
Underfull \hbox (badness 6477) in paragraph at lines 113--114
\OT1/ptm/m/n/10 like to gen-er-ate ad-di-tional la-bels au-to-mat-i-cally for
[]
Underfull \hbox (badness 1888) in paragraph at lines 113--114
\OT1/ptm/m/n/10 the multi la-bel ap-proach. There-for we're us-ing the
[]
Underfull \hbox (badness 10000) in paragraph at lines 113--114
[]
......@@ -463,54 +478,51 @@ Overfull \hbox (16.13214pt too wide) in paragraph at lines 129--139
[]
[]
<aug_example1.png, id=25, 1195.08984pt x 126.4725pt>
<aug_example1.png, id=27, 1195.08984pt x 126.4725pt>
File: aug_example1.png Graphic file (type png)
<use aug_example1.png>
Package pdftex.def Info: aug_example1.png used on input line 146.
(pdftex.def) Requested size: 213.4209pt x 22.58458pt.
<aug_example2.png, id=26, 888.31876pt x 242.78203pt>
<aug_example2.png, id=28, 888.31876pt x 242.78203pt>
File: aug_example2.png Graphic file (type png)
<use aug_example2.png>
Package pdftex.def Info: aug_example2.png used on input line 156.
(pdftex.def) Requested size: 213.4209pt x 58.32814pt.
Underfull \vbox (badness 1533) has occurred while \output is active []
Underfull \vbox (badness 5711) has occurred while \output is active []
[3 <./Architektur.png> <./aug_example1.png> <./aug_example2.png>]
Underfull \vbox (badness 1325) has occurred while \output is active []
Underfull \vbox (badness 6412) has occurred while \output is active []
(paper_working_design.bbl
Underfull \hbox (badness 2376) in paragraph at lines 9--12
[]\OT1/ptm/m/n/9 FirstName Alpher and First-Name Fotheringham-Smythe.
[]
Underfull \hbox (badness 1132) in paragraph at lines 14--17
[]\OT1/ptm/m/n/9 FirstName Alpher, First-Name Fotheringham-Smythe, and
(paper_working_design.bbl [4]
Underfull \hbox (badness 10000) in paragraph at lines 16--19
[]\OT1/ptm/m/n/9 Edward Ma. Nlp aug-men-ta-tion.
[]
)
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 236.
[4]
Package atveryend Info: Empty hook `AfterLastShipout' on input line 236.
Package atveryend Info: Empty hook `BeforeClearDocument' on input line 239.
[5
]
Package atveryend Info: Empty hook `AfterLastShipout' on input line 239.
(paper_working_design.aux)
Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 236.
Package atveryend Info: Empty hook `AtEndAfterFileList' on input line 236.
Package atveryend Info: Executing hook `AtVeryEndDocument' on input line 239.
Package atveryend Info: Empty hook `AtEndAfterFileList' on input line 239.
LaTeX Warning: There were multiply-defined labels.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 236.
Package atveryend Info: Empty hook `AtVeryVeryEnd' on input line 239.
)
Here is how much of TeX's memory you used:
6255 strings out of 492970
91935 string characters out of 3126593
192741 words of memory out of 3000000
10012 multiletter control sequences out of 15000+200000
28580 words of font info for 67 fonts, out of 3000000 for 9000
6261 strings out of 492970
92074 string characters out of 3126593
189874 words of memory out of 3000000
10014 multiletter control sequences out of 15000+200000
29095 words of font info for 69 fonts, out of 3000000 for 9000
1141 hyphenation exceptions out of 8191
32i,13n,27p,1165b,324s stack positions out of 5000i,500n,10000p,200000b,50000s
32i,13n,27p,1165b,468s stack positions out of 5000i,500n,10000p,200000b,50000s
{C:/Users/XPS15/AppData/Local/Programs/MiKTeX 2.9/fonts/enc/dvips/base/8r.enc
}<C:/Users/XPS15/AppData/Local/Programs/MiKTeX 2.9/fonts/type1/public/amsfonts/
cm/cmmi10.pfb><C:/Users/XPS15/AppData/Local/Programs/MiKTeX 2.9/fonts/type1/pub
......@@ -520,9 +532,9 @@ iKTeX 2.9/fonts/type1/urw/courier/ucrr8a.pfb><C:/Users/XPS15/AppData/Local/Prog
rams/MiKTeX 2.9/fonts/type1/urw/times/utmb8a.pfb><C:/Users/XPS15/AppData/Local/
Programs/MiKTeX 2.9/fonts/type1/urw/times/utmr8a.pfb><C:/Users/XPS15/AppData/Lo
cal/Programs/MiKTeX 2.9/fonts/type1/urw/times/utmri8a.pfb>
Output written on paper_working_design.pdf (4 pages, 551837 bytes).
Output written on paper_working_design.pdf (5 pages, 557661 bytes).
PDF statistics:
87 PDF objects out of 1000 (max. 8388607)
28 named destinations out of 1000 (max. 500000)
102 PDF objects out of 1000 (max. 8388607)
31 named destinations out of 1000 (max. 500000)
16 words of extra memory for PDF output out of 10000 (max. 10000000)
......@@ -110,7 +110,7 @@ Die zwei letztgenannten Module AN und RN aus [Learning2Compare] sind es auch, di
\subsection{Augmentations}
\subsubsection{Automatic Augmentation}
To reduce the manual annotation effort, we would like to generate additional labels automatically for the multi label approach. Therefor we’re using the ContextualWordEmbsAug Augmenter with RoBERTa [liu2019roberta] language model from nlpaug [CITATION] to insert words into a descriptive embedding. We decided on insertions and not substitutions or deletions, since these did not perform well in our tests. (For substitutions with synonyms, we would have expected a better performance, but it turned out that there weren’t enough synonyms for the key words in our sentences.) For the class squat down an example for the used word insertions would be:\\
To reduce the manual annotation effort, we would like to generate additional labels automatically for the multi-label approach. Therefore, we use the \verb'ContextualWordEmbsAug' augmenter with the RoBERTa \cite{liu2019roberta} language model from \verb'nlpaug' \cite{ma2019nlpaug} to insert words into a descriptive label. We decided on insertions rather than substitutions or deletions, since the latter did not perform well in our tests. (For substitutions with synonyms we would have expected better performance, but it turned out that there were not enough synonyms for the key words in our sentences.) For the class squat down, an example of the word insertions used would be:\\
\noindent
{\bf Description:} A human crouches down by bending their knees.\\
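A minimal sketch of how such insertions can be generated with \verb'nlpaug' (the model name and augmentation probability below are illustrative assumptions, not necessarily the exact values we used):
\begin{verbatim}
# Sketch: word insertion with a RoBERTa-based
# contextual augmenter from nlpaug; model name
# and aug_p are illustrative assumptions.
import nlpaug.augmenter.word as naw

aug = naw.ContextualWordEmbsAug(
    model_path='roberta-base',  # RoBERTa LM
    action='insert',            # insert words only
    aug_p=0.3)                  # share of tokens touched

label = 'A human crouches down by bending their knees.'
print(aug.augment(label))       # augmented sentence(s)
\end{verbatim}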
......@@ -164,7 +164,7 @@ One can see, that the augmented sentences are not necessarily grammatically corr
\subsection{Experiments}
For evaluating our model, we do training runs on 8 random 35/5 splits, which include every class once, such that every class is used as an unseen class once [TABLE]. The accuracies however are averaged of the eight individual experiments. For each approach we’re calculating the top-1 accuracy over only the 5 unseen classes (ZSL) and on seen and unseen test data and the harmonic mean, following recent works [CITATION synse or main paper] (GZSL). For default and descriptive labels, we train our Network with a batch size of 32 and without batch norm like in the original paper [CITATION relation net]. For the multi labels however, we used a batch size of 128 and batch norm. This was mainly done due to performance reasons because the multi label approach with more than 3 labels did not learn anything without batch norm at all.
For evaluating our model, we do training runs on eight random 35/5 splits which together cover every class, such that every class is used as an unseen class exactly once. The reported accuracies are averaged over the eight individual experiments. For each approach we calculate the top-1 accuracy over only the five unseen classes (ZSL) as well as the accuracies on seen and unseen test data and their harmonic mean (GZSL), following recent works \cite{jasani2019skeleton}. For the default and descriptive labels, we train our network with a batch size of 32 and without batch norm, as in the original paper \cite{sung2018learning}. For the multi labels, however, we used a batch size of 128 and batch norm. This was mainly done for performance reasons, because the multi-label approach with more than three labels did not learn anything at all without batch norm.
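Here the harmonic mean is computed from the seen and unseen top-1 accuracies in the usual way:
\begin{equation}
H = \frac{2 \cdot \mathrm{Acc}_{\mathrm{seen}} \cdot \mathrm{Acc}_{\mathrm{unseen}}}{\mathrm{Acc}_{\mathrm{seen}} + \mathrm{Acc}_{\mathrm{unseen}}}
\end{equation}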
\section{Results}
......@@ -182,6 +182,7 @@ For evaluating our model, we do training runs on 8 random 35/5 splits, which inc
\end{tabular}
\end{center}
\caption{ZSL and GZSL results for different approaches.}
\label{tab:ZSL_GZSL}
\end{table}
\begin{table}
......@@ -198,25 +199,27 @@ For evaluating our model, we do training runs on 8 random 35/5 splits, which inc
\end{tabular}
\end{center}
\caption{Unseen top-1 and top-5 accuracies results in detail.}
\label{tab:top1_top5}
\end{table}
All our results were generated following the procedure described in the Experiments section. In [TABLE] one can see the ZSL accuracies of our approach with standard deviation/min-max. [TABLE] shows the seen accuracy, unseen accuracy and the harmonic mean.
All our results were generated following the procedure described in the Experiments section. Table \ref{tab:ZSL_GZSL} reports the ZSL accuracy, the seen and unseen accuracies and the harmonic mean. Table \ref{tab:top1_top5} gives a more detailed view of the achieved unseen accuracies, listing the top-1 and top-5 accuracies for our approaches together with their standard deviations. The baseline results are shown in the row labeled ``Default Labels''. Improvements in the ZSL accuracy, the unseen accuracy and the harmonic mean were achieved with the descriptive labels and the three descriptive labels approach. Using only one manually created descriptive label plus four automatic augmentations of this description in a multi-label approach yields lower values than three descriptive labels, but still improves the unseen performance over a single descriptive label by 23\%. The seen accuracy is quite similar for all approaches; still, it is slightly higher for the multi-label approaches, which we observed whenever batch norm was used. \\
In the more detailed table one can see that the top-5 accuracies increase similarly to the top-1 accuracies. The decrease, however, is much smaller when the automatic augmentation is used; we observed this behavior frequently in experiments with the multi-label approach. As for the standard deviations, for top-1 all approaches based on the descriptive labels lie in the same range. For the top-5 accuracies we even get a decrease in standard deviation together with higher accuracy values, which shows the advantage of the multi-label approach.
\subsection{Discussion}
Discussion
One can see the baseline results in the line “Default Labels”. Improvements on the unseen accuracy and harmonic mean were achieved using the descriptive labels and the three descriptive labels approach. Using only one manually created descriptive label and four automatic augmentations of this description in a multi label approach achieves lower unseen accuracies compared to three descriptive labels, but still improves the unseen performance of using only one descriptive label by 0.035. The seen accuracy is quite similar for all approaches. Still, it is slightly higher for the multi labels approaches, which occurred due to the use of batch norm (/always when using batch norm). For the ZSL accuracy in [TABLE] we can observe the same behavior.
\newline
“Why does this work?”
The improvement from the use of descriptive labels over the use of the default labels shows that incorporating more visual information into the semantic embedding by using visual descriptions as class labels helps the network to find a general relation between the semantic and the visual space learned only on the seen training data. Plainly speaking the network finds more similarities between the describing sentences compared to just one-word labels. Usually this should already be solved by using text embedding techniques that were trained on large text corpora to find semantic relationships. But the problem with that is that the texts it was trained on contains the words used to describe motions in many different contexts and usually not visually describing it. The main reason for this is, that most humans don not need e.g. an explanation on what “stand up” is. But for our task the visual relationships are needed which could explain why using descriptive labels leads to improvements.
\newline
For the multi label approach the idea is little bit different. The main motivation here was that using more data is generally a good idea. In Our case the network is forced to learn a more general mapping between the semantic and the visual feature space since the descriptions and therefor also the embeddings change during training randomly. This better generalization on seen training data than helps to better understand and with that classify the unseen samples.
\newline
As described in [Methods] using automatic augmentation methods introduces some kind of variance/diversity into the different embeddings not only focusing on the visual description of the classes and therefore different to the manually created multi labels. This introduced variance/diversity could be modeled as noise. But in contrast to just adding random noise to the embedding vector keep semantic information and relationships. This still helps the network to generalize its mapping. Experiments using only random noise to generate diverse label embeddings lead to no performance improvements in top-1 accuracy.
\subsubsection{From default to descriptive labels}
The improvement of the descriptive labels over the default labels shows that incorporating more visual information into the semantic embedding, by using visual descriptions as class labels, helps the network to find a general relation between the semantic and the visual space that is learned only on the seen training data. Plainly speaking, the network finds more similarities between the describing sentences than between one-word labels. Usually this should already be enabled by text embedding techniques that were trained on large text corpora to capture semantic relationships. The problem, however, is that the texts they were trained on contain the words used to describe motions in many different contexts, and usually not in a visually descriptive way. The main reason for this is that most humans do not need, for example, an explanation of what ``stand up'' looks like. For our task, however, the visual relationships are needed, which could explain why using descriptive labels leads to improvements.
\subsubsection{Using multiple labels}
For the multi-label approach the idea is a little different. The main motivation here was that using more data is generally a good idea. In our case the network is forced to learn a more general mapping between the semantic and the visual feature space, since the descriptions, and therefore also the embeddings, change randomly during training. The network has to adapt to the greater variance of the label embeddings used. (Here we could insert a t-SNE plot of the AN output features to illustrate this.) This better generalization on the seen training data then helps to better understand, and with that classify, the unseen samples.
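As an illustration, a hypothetical sketch of this per-sample selection (not our exact training code; the names are placeholders):
\begin{verbatim}
# Hypothetical sketch: every training iteration draws
# one of the class's precomputed label embeddings at
# random, so the semantic input keeps changing.
import random

def pick_label_embedding(class_id, label_embeddings):
    # label_embeddings: dict class_id -> list of vectors
    return random.choice(label_embeddings[class_id])
\end{verbatim}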
\subsubsection{Automatic augmentation}
As described in the Method section, using automatic augmentation methods introduces a certain variance (or diversity) into the different embeddings. Since this variance does not only concern the visual description of the classes, and therefore differs from the manually created multi labels, it could be modeled as noise. In contrast to simply adding random noise to the embedding vector, however, it keeps semantic information and relationships, which still helps the network to generalize its mapping. Experiments using only random noise to generate diverse label embeddings led to no improvement in top-1 accuracy.
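For comparison, the noise-only baseline mentioned above simply perturbs a single label embedding; a sketch (the noise scale is an arbitrary assumption):
\begin{verbatim}
# Sketch of the noise-only baseline: additive Gaussian
# noise carries no semantic structure.
import numpy as np

def noisy_embedding(emb, sigma=0.05):  # sigma illustrative
    return emb + np.random.normal(0.0, sigma, emb.shape)
\end{verbatim}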
\section{Conclusion}
In this work, we showed the (/present a proof of concept of the) importance of the semantic embeddings in the context of skeleton based Zero-Shot Gesture Recognition by using data augmentation of semantic embeddings. By including more visual information in the sentence labels that describe the classes and combining multiple descriptions per class we could improve the model based on [CITATION] by a significant margin. The use of automatic text augmentation methods like [ma2019nlpaug] already reduces the effort of manual annotation significantly, while maintaining most of the performance. Together with a further reduction of the manual annotation effort in the future, data augmentation of the semantic embedding in Zero-Shot Learning could prove useful in optimizing the performance of any Zero-Shot approach.
In this work, we presented a proof of concept for the importance of the semantic embeddings in the context of skeleton-based Zero-Shot Gesture Recognition by using data augmentation of the semantic embeddings. By including more visual information in the sentence labels that describe the classes and by combining multiple descriptions per class, we could improve the model based on \cite{jasani2019skeleton} by a significant margin. The use of automatic text augmentation methods such as \cite{ma2019nlpaug} already reduces the manual annotation effort significantly while maintaining most of the performance. Together with a further reduction of the manual annotation effort in the future, data augmentation of the semantic embedding in Zero-Shot Learning could prove useful in optimizing the performance of any Zero-Shot approach.
To achieve this, future works could further investigate the following topics: First, generating sentences from the default labels using methods from Natural Language Processing (NLP) could be implemented to further reduce the manual annotation effort. Second, additional tests on different zero-shot architectures could be performed to verify the improvements shown in our work. Finally, different kinds or combinations of automatic text augmentation methods could be evaluated.
......