<?xml version="1.0" encoding="utf-8"?>
<raweb xmlns:xlink="http://www.w3.org/1999/xlink" xml:lang="" year="2019">
  <identification id="thoth" isproject="true">
    <shortname>THOTH</shortname>
    <projectName>Learning visual models from large-scale data</projectName>
    <theme-de-recherche>Vision, perception and multimedia interpretation</theme-de-recherche>
    <domaine-de-recherche>Perception, Cognition and Interaction</domaine-de-recherche>
    <urlTeam>http://thoth.inrialpes.fr/</urlTeam>
    <structure_exterieure type="Labs">
      <libelle>Laboratoire Jean Kuntzmann (LJK)</libelle>
    </structure_exterieure>
    <header_dates_team>Creation of the Team: 2016 January 01, updated into Project-Team: 2016 March 01</header_dates_team>
    <LeTypeProjet>Project-Team</LeTypeProjet>
    <keywordsSdN>
      <term>A3.4. - Machine learning and statistics</term>
      <term>A5.3. - Image processing and analysis</term>
      <term>A5.4. - Computer vision</term>
      <term>A5.9. - Signal processing</term>
      <term>A6.2.6. - Optimization</term>
      <term>A8.2. - Optimization</term>
      <term>A9.2. - Machine learning</term>
      <term>A9.3. - Signal analysis</term>
      <term>A9.7. - AI algorithmics</term>
    </keywordsSdN>
    <keywordsSecteurs>
      <term>B5.6. - Robotic systems</term>
      <term>B8.4. - Security and personal assistance</term>
      <term>B8.5. - Smart society</term>
      <term>B9.5.1. - Computer science</term>
      <term>B9.5.6. - Data science</term>
    </keywordsSecteurs>
    <UR name="Grenoble"/>
  </identification>
  <team id="uid1">
    <person key="thoth-2018-idp112112">
      <firstname>Julien</firstname>
      <lastname>Mairal</lastname>
      <categoryPro>Chercheur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Team leader, Inria, Researcher</moreinfo>
      <hdr>oui</hdr>
    </person>
    <person key="thoth-2018-idp119968">
      <firstname>Cordelia</firstname>
      <lastname>Schmid</lastname>
      <categoryPro>Chercheur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Senior Researcher</moreinfo>
      <hdr>oui</hdr>
    </person>
    <person key="thoth-2018-idp115024">
      <firstname>Karteek</firstname>
      <lastname>Alahari</lastname>
      <categoryPro>Chercheur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Researcher</moreinfo>
      <hdr>oui</hdr>
    </person>
    <person key="thoth-2018-idp117488">
      <firstname>Grégory</firstname>
      <lastname>Rogez</lastname>
      <categoryPro>Chercheur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Starting Research Position, until Jan 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp122832">
      <firstname>Jakob</firstname>
      <lastname>Verbeek</lastname>
      <categoryPro>Chercheur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Senior Researcher</moreinfo>
      <hdr>oui</hdr>
    </person>
    <person key="thoth-2019-idp127152">
      <firstname>Jocelyn</firstname>
      <lastname>Chanussot</lastname>
      <categoryPro>Enseignant</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Institut polytechnique de Grenoble, Professor, from Sep 2019</moreinfo>
      <hdr>oui</hdr>
    </person>
    <person key="thoth-2018-idp128160">
      <firstname>Adria</firstname>
      <lastname>Ruiz Ovejero</lastname>
      <categoryPro>PostDoc</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Post-Doctoral Fellow</moreinfo>
    </person>
    <person key="thoth-2018-idp130624">
      <firstname>Minttu</firstname>
      <lastname>Alakuijala</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2019-idp134960">
      <firstname>Florent</firstname>
      <lastname>Bartoccioni</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, from Nov 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp133056">
      <firstname>Alberto</firstname>
      <lastname>Bietti</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp135488">
      <firstname>Mathilde</firstname>
      <lastname>Caron</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Facebook, PhD Student, granted by CIFRE</moreinfo>
    </person>
    <person key="thoth-2018-idp137968">
      <firstname>Dexiong</firstname>
      <lastname>Chen</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ. Grenoble Alpes, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp142832">
      <firstname>Mikita</firstname>
      <lastname>Dvornik</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp145264">
      <firstname>Maha</firstname>
      <lastname>Elbayad</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp147696">
      <firstname>Valentin</firstname>
      <lastname>Gabeur</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2019-idp152048">
      <firstname>Pierre Louis</firstname>
      <lastname>Guhur</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Paris-Saclay, PhD Student, from Sep 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp150128">
      <firstname>Yana</firstname>
      <lastname>Hasson</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp152560">
      <firstname>Ekaterina</firstname>
      <lastname>Iakovleva</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp154992">
      <firstname>Roman</firstname>
      <lastname>Klokov</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp157424">
      <firstname>Andrei</firstname>
      <lastname>Kulunchakov</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2019-idp164256">
      <firstname>Bruno</firstname>
      <lastname>Lecouat</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, from Sep 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp166704">
      <firstname>Hubert</firstname>
      <lastname>Leterme</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, PhD Student, from Sep 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp159856">
      <firstname>Pauline</firstname>
      <lastname>Luc</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Facebook CIFRE, PhD Student, until May 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp162304">
      <firstname>Thomas</firstname>
      <lastname>Lucas</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, PhD Student</moreinfo>
    </person>
    <person key="thoth-2019-idp174096">
      <firstname>Lina</firstname>
      <lastname>Mezghani</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, from Nov 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp164736">
      <firstname>Gregoire</firstname>
      <lastname>Mialon</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp167168">
      <firstname>Alexander</firstname>
      <lastname>Pashevich</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp169600">
      <firstname>Alexandre</firstname>
      <lastname>Sablayrolles</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Facebook CIFRE, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp172032">
      <firstname>Konstantin</firstname>
      <lastname>Shmelkov</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, until Mar 2019</moreinfo>
    </person>
    <person key="willow-2018-idp154880">
      <firstname>Robin</firstname>
      <lastname>Strudel</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>École Normale Supérieure de Paris, PhD Student</moreinfo>
    </person>
    <person key="thoth-2018-idp174464">
      <firstname>Vladyslav</firstname>
      <lastname>Sydorov</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student</moreinfo>
    </person>
    <person key="thoth-2019-idp191200">
      <firstname>Gul</firstname>
      <lastname>Varol Simsekli</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, until Feb 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp181776">
      <firstname>Nitika</firstname>
      <lastname>Verma</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, PhD Student, until May 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp184208">
      <firstname>Daan</firstname>
      <lastname>Wynen</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, PhD Student, until Sep 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp198576">
      <firstname>Houssam</firstname>
      <lastname>Zenati</lastname>
      <categoryPro>PhD</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Criteo, PhD Student, from Oct 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp186640">
      <firstname>Ghislain</firstname>
      <lastname>Durif</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, until Jan 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp203504">
      <firstname>Ricardo Jose</firstname>
      <lastname>Garcia Pinel</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, from Jul 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp189104">
      <firstname>François</firstname>
      <lastname>Gindraud</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, until Mar 2019</moreinfo>
    </person>
    <person key="willow-2018-idp174496">
      <firstname>Igor</firstname>
      <lastname>Kalevatykh</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, until Nov 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp164256">
      <firstname>Bruno</firstname>
      <lastname>Lecouat</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, until Aug 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp191568">
      <firstname>Xavier</firstname>
      <lastname>Martin</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer</moreinfo>
    </person>
    <person key="thoth-2019-idp215888">
      <firstname>Alexandre</firstname>
      <lastname>Zouaoui</lastname>
      <categoryPro>Technique</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Engineer, from Dec 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp152048">
      <firstname>Pierre Louis</firstname>
      <lastname>Guhur</lastname>
      <categoryPro>Stagiaire</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Paris-Saclay, from Apr 2019 until Aug 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp166704">
      <firstname>Hubert</firstname>
      <lastname>Leterme</lastname>
      <categoryPro>Stagiaire</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ Grenoble Alpes, from Feb 2019 until Jul 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp223360">
      <firstname>Matthieu</firstname>
      <lastname>Toulemont</lastname>
      <categoryPro>Stagiaire</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>ENPC Paris, from Apr 2019 until Sep 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp198576">
      <firstname>Houssam</firstname>
      <lastname>Zenati</lastname>
      <categoryPro>Stagiaire</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Criteo, from Apr 2019 until Oct 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp213856">
      <firstname>Nathalie</firstname>
      <lastname>Gillot</lastname>
      <categoryPro>Assistant</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, Administrative Assistant</moreinfo>
    </person>
    <person key="thoth-2019-idp230816">
      <firstname>Hernan Dario</firstname>
      <lastname>Benitez Restrepo</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Pontificia Universidad Javeriana Sede Cali, from Nov 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp216320">
      <firstname>Pia</firstname>
      <lastname>Bideau</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Univ. Massachusetts Amherst, until Jan 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp235792">
      <firstname>Avijit</firstname>
      <lastname>Dasgupta</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>IIIT Hyderabad, from Feb 2019 until May 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp238288">
      <firstname>Ning</firstname>
      <lastname>Huyan</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>from Dec 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp240752">
      <firstname>Dou</firstname>
      <lastname>Quan</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>Inria, from Dec 2019</moreinfo>
    </person>
    <person key="thoth-2019-idp243184">
      <firstname>Gunnar Atli</firstname>
      <lastname>Sigurdsson</lastname>
      <categoryPro>Visiteur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>CMU, until Mar 2019</moreinfo>
    </person>
    <person key="thoth-2018-idp186640">
      <firstname>Ghislain</firstname>
      <lastname>Durif</lastname>
      <categoryPro>CollaborateurExterieur</categoryPro>
      <research-centre>Grenoble</research-centre>
      <moreinfo>CNRS, from Feb 2019</moreinfo>
    </person>
  </team>
  <presentation id="uid2">
    <bodyTitle>Overall Objectives</bodyTitle>
    <subsection id="uid3" level="1">
      <bodyTitle>Overall Objectives</bodyTitle>
<p>In 2021, nearly 82% of Internet traffic is expected to be video, and it
would take an individual over 5 million years to watch the amount of video
that will cross global IP networks each month by then. There is thus a
pressing, and in fact increasing, demand to annotate and index this
visual content for home and professional users alike. The available
text and speech-transcript metadata is typically not sufficient by itself for
answering most queries, and visual data must come into play. On the
other hand, it is not feasible to learn the models of visual content
required to answer these queries by manually and precisely annotating
every relevant concept, object, scene, or action category in a
representative sample of everyday conditions, if only because it may
be difficult, or even impossible, to decide a priori which categories are
relevant and at what level of granularity. This suggests
reverting to the original metadata as a source of annotation,
despite the fact that the information it provides is typically sparse
(e.g., the location and overall topic of newscasts in a video archive)
and noisy (e.g., a movie script may tell us that two persons kiss in
some scene, but not when, and the kiss may occur off screen or not
have survived the final cut). Nevertheless, this weak form of
“embedded annotation” is rich and diverse, and mining the
corresponding visual data from the web, TV or film archives guarantees
that it is representative of the many different scene settings
depicted in situations typical of on-line content. Thus, leveraging
this largely untapped source of information, rather than attempting to
hand-label all possibly relevant visual data, is key to the future
use of on-line imagery.</p>
      <p>Today's object recognition and scene understanding technology operates
in a very different setting; it mostly relies on fully supervised
classification engines, and visual models are essentially (piecewise)
rigid templates learned from hand-labeled images. The sheer scale of
on-line data and the nature of the embedded annotation call for a
departure from this fully supervised scenario. The main idea of the
Thoth project-team is to develop a new framework for learning
the structure and parameters of visual models by actively exploring
large digital image and video sources (off-line archives as well as
growing on-line content, with millions of images and thousands of hours of video), and
exploiting the weak supervisory signal provided by the accompanying
metadata. This huge volume of visual training data will allow us to
learn complex non-linear models with a large number of parameters,
such as deep convolutional networks and higher-order graphical
models. This is an ambitious goal, given the sheer volume and
intrinsic variability of the visual data available on-line, and the
lack of a universally accepted formalism for modeling it. Yet, the
potential payoff is a breakthrough in visual object recognition and
scene understanding capabilities. Further, recent advances at a
smaller scale suggest that this is realistic. For example, it is
already possible to determine the identity of multiple people from
news images and their captions, or to learn human action models from
video scripts. There has also been recent progress in adapting
supervised machine learning technology to large-scale settings, where
the training data is very large and potentially infinite, and some of
it may not be labeled. Methods that adapt the structure of visual
models to the data are also emerging, and the growing computational
power and storage capacity of modern computers are enabling factors
that should of course not be neglected.</p>
<p>One of the main objectives of Thoth is to transform massive visual data
into trustworthy knowledge libraries. To that end, it addresses several challenges:</p>
      <simplelist>
        <li id="uid4">
          <p noindent="true">designing and learning structured models capable of representing complex visual
information.</p>
        </li>
        <li id="uid5">
          <p noindent="true">learning visual models from minimal supervision or unstructured meta-data.</p>
        </li>
        <li id="uid6">
          <p noindent="true">large-scale learning and optimization.</p>
        </li>
      </simplelist>
    </subsection>
  </presentation>
  <fondements id="uid7">
    <bodyTitle>Research Program</bodyTitle>
    <subsection id="uid8" level="1">
      <bodyTitle>Designing and learning structured models</bodyTitle>
<p>The task of understanding image and video content has been interpreted in
several ways over the past few decades, namely classifying images, detecting
objects in a scene, recognizing objects and their spatial extent in an image,
estimating human poses, recovering scene geometry, and recognizing activities
performed by humans. However, addressing all these problems individually
provides us with at best a partial understanding of the scene, leaving much of
the visual data unexplained.</p>
      <p>One of the main goals of this research axis is to go beyond the initial
attempts that consider only a subset of tasks jointly, by developing novel
models for a more complete understanding of scenes to address all the
component tasks. We propose to incorporate the structure in image and video
data explicitly into the models. In other words, our models aim to satisfy
the complex sets of constraints that exist in natural images and videos.
Examples of such constraints include: (i) relations between objects, e.g.,
shop signs indicate the presence of buildings, and people on a road are
usually walking or standing; (ii) higher-level semantic relations involving
the type of scene, geographic location, and the plausible actions as a
global constraint, e.g., an image taken at a swimming pool is unlikely to
contain cars; (iii) relating objects occluded in some video frames to
content in other frames where they are more clearly visible, as the camera
or the object itself moves, with the use of long-term trajectories and video
object proposals.</p>
<p>This research axis will focus on three topics. The first is developing deep
features for video. This involves designing rich features that capture
long-range temporal interactions among pixels in a video sequence, in order to
learn a representation that is truly spatio-temporal in nature. The focus of the
second topic is the challenging problem of modeling human activities in video,
starting from human activity descriptors to building intermediate
spatio-temporal representations of videos, and then learning the interactions
among humans, objects and scenes temporally. The last topic is aimed at
learning models that capture the relationships among several objects and
regions in a single image scene, and additionally, among scenes in the case of
an image collection or a video. The main scientific challenges in this topic
stem from learning the structure of the probabilistic graphical model as well
as the parameters of the cost functions quantifying the relationships among its
entities. In the following we present work related to all three
topics and then elaborate on our research directions.</p>
      <simplelist>
        <li id="uid9">
          <p noindent="true"><b>Deep features for vision.</b>
Deep learning models provide a rich representation of complex objects
but in return have a large number of parameters. Thus, to work well on
difficult tasks, a large amount of data is
required. In this context, video presents several advantages: objects are
observed from a large range of viewpoints, motion information allows the
extraction of moving objects and parts, and objects can be differentiated by
their motion patterns. We initially plan to develop deep features for
videos that incorporate temporal information at multiple scales. We
then plan to further exploit the rich content in video by
incorporating additional cues, such as the detection of people and
their body-joint locations in video, or minimal prior knowledge
of the object of interest, with the goal of learning a
representation that is more appropriate for video understanding, that is,
a representation learned from video data and targeted at
specific applications. For the
application of recognizing human activities, this involves learning deep
features for humans and their body parts with all their spatio-temporal
variations, either directly from raw video data or from “pre-processed” videos
containing human detections. For the application of object tracking, this task
amounts to learning object-specific deep representations, further exploiting
the limited annotation provided to identify the object.</p>
        </li>
        <li id="uid10">
          <p noindent="true"><b>Modeling human activities in videos.</b>
Humans and their activities are not only one of the most frequent and
interesting subjects in videos but also one of the hardest to analyze owing to
the complexity of the human form, clothing and movements. As part of this task,
the Thoth project-team plans to build on state-of-the-art approaches for
spatio-temporal representation of videos.
This will involve using the dominant motion in the scene as well as the local
motion of individual parts undergoing rigid motion. Such motion information
also helps in reasoning about occlusion relationships among people and objects,
and the state of the object. This novel
spatio-temporal representation ultimately provides the equivalent of object
proposals for videos, and is an
important component for learning algorithms using minimal supervision. To take this representation even further, we
aim to integrate the proposals and the occlusion relationships with methods for
estimating human pose in videos, thus leveraging the interplay
among body-joint locations, objects in the scene, and the activity being
performed. For example, the locations of shoulder, elbow and wrist of a person
drinking coffee are constrained to move in a certain way, which is completely
different from the movement observed when a person is typing. In essence, this
step will model the dynamics of human activities in terms of both low-level
movements of body-joint locations and global high-level motion in the scene.</p>
        </li>
        <li id="uid11">
          <p noindent="true"><b>Structured models.</b>
The interactions among various elements in a scene, such as, the objects and
regions in it, the motion of object parts or entire objects themselves, form a
key element for understanding image or video content. These rich cues define
the structure of visual data and how it evolves spatio-temporally. We plan to
develop a novel graphical model to exploit this structure. The main components
in this graphical model are spatio-temporal regions (in the case of video or
simply image regions), which can represent object parts or entire objects
themselves, and the interactions among several entities. The dependencies among
the scene entities are defined with a higher-order or a global cost function. A
higher-order constraint is a generalization of the pairwise interaction term,
and is a cost function involving more than two components in the scene, e.g.,
several regions, whereas a global constraint imposes a cost term over the
entire image or video, e.g., a prior on the number of people expected in the
scene (a worked energy function illustrating this decomposition is given after
this list). The constraints we plan to include generalize several existing methods,
which are limited to pairwise interactions or a small
restrictive set of higher-order costs. In addition to learning
the parameters of these novel functions, we
will focus on learning the structure of the graph itself—a challenging
problem that is seldom addressed in current approaches. This provides an
elegant way to go beyond state-of-the-art deep learning methods, which are
limited to learning the high-level interaction among parts of an object, by
learning the relationships among objects.</p>
        </li>
      </simplelist>
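<p>As a concrete illustration of the cost functions mentioned in the last item, a minimal energy over scene entities $x = (x_1, \dots, x_n)$ that combines unary, pairwise, higher-order and global terms can be written (in our notation, for illustration only) as
$$E(x) = \sum_{i} \phi_i(x_i) + \sum_{(i,j)} \psi_{ij}(x_i, x_j) + \sum_{c \in \mathcal{C}} \phi_c(x_c) + \phi_g(x),$$
where each clique $c \in \mathcal{C}$ groups more than two entities (e.g., several regions) and the global term $\phi_g$ acts on the entire image or video (e.g., a prior on the number of people). Learning then concerns both the potentials and which cliques appear in the graph.</p>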
    </subsection>
    <subsection id="uid12" level="1">
      <bodyTitle>Learning of visual models from minimal supervision</bodyTitle>
      <p>Today's approaches to visual recognition learn models for a limited and fixed
set of visual categories with fully supervised classification techniques.
This paradigm was adopted in the early 2000s, and enormous
progress has been made within it over the last decade.</p>
<p>The scale and diversity of today's large and growing image and video
collections (e.g., broadcast archives and personal image/video
collections) call for a departure from the current paradigm. To answer
queries about such data, it is infeasible to learn
models of visual content by manually and precisely annotating every
relevant concept, object, scene, or action category in a representative
sample of everyday conditions. For one, it is difficult, or even
impossible, to decide a priori which categories are relevant and at what
level of granularity. Moreover, the cost of such annotations would be
prohibitive in most application scenarios. One of the main goals of the
Thoth project-team is to develop a new framework for learning visual
recognition models by actively exploring large digital image and video
sources (off-line archives as well as growing on-line content), and
exploiting the weak supervisory signal provided by the accompanying metadata
(such as captions, keywords, tags, subtitles, or scripts) and audio signal
(from which we can for example extract speech transcripts, or exploit
speaker recognition models).</p>
      <p>Textual metadata has traditionally been used to index and search for visual
content. The information in metadata is, however, typically sparse (e.g.,
the location and overall topic of newscasts in a video archive <footnote id="uid13" id-text="1">For
example at the Dutch national broadcast archive Netherlands Institute of
Sound and Vision, with whom we collaborated in the EU FP7 project AXES,
typically one or two sentences are used in the metadata to describe a
one-hour-long TV program.</footnote>) and noisy (e.g., a movie script may tell us that two
persons kiss in some scene, but not when, and the kiss may occur off screen
or not have survived the final cut). For this reason, metadata search
should be complemented by visual content based search, where visual
recognition models are used to localize content of interest that is not
mentioned in the metadata, to increase the usability and value of
image/video archives. <i>The key insight that we build on in this
research axis is that while the metadata for a single image or video is too
sparse and noisy to rely on for search, the metadata associated with large
video and image databases collectively provide an extremely versatile
source of information to learn visual recognition models. </i> This form of
“embedded annotation” is rich, diverse and abundantly available. Mining
these correspondences from the web, TV and film archives, and online
consumer-generated content sites such as Flickr, Facebook, or YouTube,
guarantees that the learned models are representative of many different
situations, unlike models learned from manually collected, fully supervised
training data sets, which are often biased.</p>
      <p>The approach we propose to address the limitations of the fully supervised
learning paradigm aligns with “Big Data” approaches developed in other
areas: we rely on the orders-of-magnitude-larger training sets that have
recently become available with metadata to compensate for less explicit
forms of supervision. This will form a sustainable approach to learn visual
recognition models for a much larger set of categories with little or no
manual intervention. Reducing and ultimately removing the dependency on
manual annotations will dramatically reduce the cost of learning visual
recognition models. This in turn will allow such models to be used in many
more applications, and enable new applications based on visual recognition
beyond a fixed set of categories, such as natural language based querying
for visual content. This is an ambitious goal, given the sheer volume and
intrinsic variability of the everyday visual content available on-line, and
the lack of a universally accepted formalism for modeling it. Yet, the
potential payoff is a breakthrough in visual object recognition and scene
understanding capabilities.</p>
      <p>This research axis is organized into the following three sub-tasks:</p>
      <simplelist>
        <li id="uid14">
          <p noindent="true"><b>Weakly supervised learning.</b> For object localization we will
go beyond current methods that learn one category model at a time and
develop methods that learn models for different categories
concurrently. This allows “explaining away” effects to be leveraged,
i.e., if a certain region in an image has been identified as an
instance of one category, it cannot be an instance of another category
at the same time. For weakly supervised detection in video we will
consider detection proposal methods. While these are effective for
still images, recent approaches for the spatio-temporal domain need
further improvements to be similarly effective. Furthermore, we will
exploit appearance and motion information jointly over a set of
videos. In the video domain we will also continue to work on learning
recognition models from subtitle and script information. Since scripts are
not temporally aligned with the video, the basis for leveraging them is to
match the narrative in the script against the subtitles (which are
temporally aligned with the video). We
will go beyond simple correspondences between names and verbs relating
to self-motion, and match more complex sentences related to
interaction with objects and other people. To deal with the limited
number of occurrences of such actions in a single movie, we will
consider approaches that learn action models across a collection of
movies.</p>
        </li>
        <li id="uid15">
          <p noindent="true"><b>Online learning of visual models.</b> As a larger number of
visual category models is being learned, online learning methods
become important, since new training data and categories will arrive
over time. We will develop online learning methods that can
incorporate new examples for existing category models, and learn new
category models from few examples by leveraging similarity to related
categories using multi-task learning methods. Here we will develop
new distance-based classifiers and attribute and label embedding
techniques, and explore the use of NLP techniques such as skipgram
models to automatically determine between which classes transfer
should occur. Moreover, NLP will be useful in the context of learning
models for many categories, to identify synonyms and to detect
cases of polysemy (e.g., the Jaguar car brand vs. the jaguar animal), and to
merge or refine categories accordingly. Ultimately this will result
in methods that are able to learn an “encyclopedia” of visual
models.</p>
        </li>
        <li id="uid16">
          <p noindent="true"><b>Visual search from unstructured textual queries.</b> We will
build on recent approaches that learn recognition models on-the-fly
(as the query is issued) from generic image search engines such as
Google Images. While it is feasible to learn models in this manner in
a matter of seconds, it is challenging to use the model to retrieve
relevant content in real-time from large video archives of more than a
few thousand hours. Achieving this requires feature compression
techniques to store visual representations in memory, and cascaded
search techniques to avoid exhaustive search. This approach, however,
leaves untouched the core problem of how to associate visual material
with the textual query in the first place.
The second approach we will explore is based on image annotation
models. In particular we will go beyond image-text retrieval methods
by using recurrent neural networks such as Elman networks or long
short-term memory (LSTM) networks to generate natural language
sentences to describe images.</p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid17" level="1">
      <bodyTitle>Large-scale learning and optimization</bodyTitle>
      <p>We have entered an era of massive data acquisition, leading to the revival of
an old scientific utopia: it should be possible to better understand the world
by automatically converting data into knowledge. It is also leading to a new
economic paradigm, where data is a valuable asset and a source of activity.
Therefore, developing scalable technology to make sense of massive data has
become a strategic issue. Computer vision has already started to adapt to
these changes.</p>
      <p>In particular, very high dimensional models such as deep networks
are becoming highly popular and successful for visual recognition.
This change is closely related to the advent of big data. On the one hand,
these models involve a huge number of parameters and are rich enough to
represent complex objects such as natural images or text corpora well. On the
other hand, they are prone to overfitting (fitting the training data too
closely, without generalizing to new, unseen data) despite regularization; to
work well on difficult tasks, they require a large amount of labeled data that
has become available only recently. Other factors may explain their success: the
deep learning community has made significant engineering efforts, making it
possible to train in a day on a GPU large models that would have required weeks
of computation on a traditional CPU, and it
has accumulated enough empirical experience to find good hyper-parameters for
its networks.</p>
      <p>To learn the huge number of parameters of deep
hierarchical models requires scalable optimization
techniques and large amounts of data to prevent overfitting. This
immediately raises two major challenges: how to learn without large
amounts of labeled data, or with weakly supervised annotations? How to
efficiently learn such huge-dimensional models?
To answer the above challenges, we will concentrate on the design and
theoretical justifications of deep architectures including our recently
proposed deep kernel machines, with a focus on weakly supervised and
unsupervised learning, and develop continuous and discrete optimization
techniques that push the state of the art in terms of speed and scalability.</p>
      <p>This research axis will be developed into three sub-tasks:</p>
      <simplelist>
        <li id="uid18">
          <p noindent="true"><b>Deep kernel machines for structured data.</b>
Deep kernel machines combine advantages of kernel methods
and deep learning. Both approaches rely on high-dimensional models.
Kernels implicitly operate in a space of possibly infinite dimension,
whereas deep networks explicitly construct high-dimensional nonlinear
data representations. Yet, these approaches are complementary: Kernels
can be built with deep learning principles such as hierarchies and
convolutions, and approximated by multilayer neural networks.
Furthermore, kernels work with structured data and have well
understood theoretical principles. Thus, a goal of the Thoth
project-team is to design and optimize the training of such deep
kernel machines.</p>
        </li>
        <li id="uid19">
          <p noindent="true"><b>Large-scale parallel optimization.</b>
Deep kernel machines produce nonlinear representations of input
data points. After encoding these data points, a learning task is often
formulated as a <i>large-scale convex optimization problem</i>; for example,
this is the case for linear support vector machines, logistic
regression classifiers, or more generally many empirical risk
minimization formulations (a generic instance is written out after this
list). We intend to pursue recent efforts to make convex optimization
techniques dedicated to machine learning more scalable. Most existing
approaches address scalability issues either in model size (meaning that
the function to minimize is defined on a domain of very high dimension),
or in the amount of training data (typically, the objective is a large sum
of elementary functions). There is thus large room for improvement in
techniques that jointly take these two criteria into account.</p>
        </li>
        <li id="uid20">
          <p noindent="true"><b>Large-scale graphical models.</b>
To represent structured data, we will also investigate graphical models and
their optimization. The challenge here is two-fold:
designing an adequate cost function and minimizing it.
While several cost functions are possible, their utility will be largely
determined by the efficiency and the effectiveness of the optimization
algorithms for solving them. Minimization here is a combinatorial optimization
problem involving billions of variables, and is NP-hard in general, requiring
us to go beyond classical approximate inference techniques. The main challenges in
minimizing cost functions stem from the large number of variables to be
inferred, the inherent structure of the graph induced by the interaction terms
(e.g., pairwise terms), and the high-arity terms which constrain multiple
entities in a graph.</p>
        </li>
      </simplelist>
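<p>For reference, the generic empirical risk minimization problem alluded to above reads, for encoded data points $x_1, \dots, x_n$ with labels $y_1, \dots, y_n$,
$$\min_{w \in \mathbb{R}^d} \; \frac{1}{n} \sum_{i=1}^{n} \ell\big(y_i, w^\top x_i\big) + \lambda \, \Omega(w),$$
where $\ell$ is a convex loss (the hinge loss for linear SVMs, the logistic loss for logistic regression) and $\Omega$ is a regularizer such as the squared Euclidean norm. Scalability issues arise both when the dimension $d$ is very high (model size) and when the number of samples $n$ is very large (amount of training data).</p>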
    </subsection>
    <subsection id="uid21" level="1">
      <bodyTitle>Datasets and evaluation</bodyTitle>
      <p>Standard benchmarks with associated evaluation measures are becoming
increasingly important in computer vision, as they enable an
objective comparison of state-of-the-art approaches.
Such datasets need to be relevant for real-world application scenarios;
challenging for state-of-the-art algorithms; and
large enough to produce statistically significant results.</p>
<p>A decade ago, small datasets were used to evaluate relatively simple tasks,
such as interest point detection and matching. Since then, the size
of the datasets and the complexity of the tasks have gradually grown. An
example is the Pascal Visual Object Challenge with 20 classes
and approximately 10,000 images, which evaluates object classification and
detection. Another example is the ImageNet challenge, including thousands
of classes and millions of images. In the context of video classification,
the TrecVid Multimedia Event Detection challenges, organized by NIST,
evaluate activity classification on a dataset of over 200,000 video clips,
representing more than 8,000 hours of video, which amounts to 11 months of
continuous video.</p>
<p>Almost all existing image and video datasets are annotated by hand;
this is the case for all of the examples cited above. In some cases, they
present limited and unrealistic viewing conditions. For example, many images
of the ImageNet dataset depict upright objects with virtually no background
clutter, and they may not capture particularly relevant visual concepts:
most people would not know the majority of subcategories of snakes cataloged
in ImageNet. This holds true for video datasets as well, where in addition
a taxonomy of action and event categories is missing.</p>
      <p>Our effort on data collection and evaluation will focus on two directions. First,
we will design and assemble video datasets, in particular for action and
activity recognition. This includes defining relevant taxonomies of
actions and activities. Second, we will provide data and
define evaluation protocols for weakly supervised learning methods. This does not mean of
course that we will forsake human supervision altogether: some amount
of ground-truth labeling is necessary for experimental validation and
comparison to the state of the art. Particular attention will be paid
to the design of efficient annotation tools.</p>
<p>We plan not only to collect datasets, but also to provide them to the
community, together with accompanying evaluation protocols and software, to
enable a comparison of competing approaches for action recognition and
large-scale weakly supervised learning. Furthermore, we plan to set up
evaluation servers together with leader-boards, to establish an unbiased
state of the art on held out test data for which the ground-truth
annotations are not distributed. This is crucial to avoid tuning the
parameters for a specific dataset and to guarantee a fair evaluation.</p>
      <simplelist>
        <li id="uid22">
          <p noindent="true"><b>Action recognition.</b> We will develop datasets for recognizing
human actions and human-object interactions (including multiple
persons) with a significant number of actions. Almost all of today's
action recognition datasets evaluate classification of short video
clips into a number of predefined categories, in many cases a number
of different sports, which are relatively easy to identify by their
characteristic motion and context. However, in many real-world
applications the goal is to identify and localize actions in entire
videos, such as movies or surveillance videos of several hours. The
actions targeted here are “real-world” and will be defined by
compositions of atomic actions into higher-level activities. One
essential component is the definition of relevant taxonomies of
actions and activities. We think that such a definition needs to rely
on a decomposition of actions into poses, objects and scenes, as
determining all possible actions without such a
decomposition is not feasible. We plan to provide annotations for
spatio-temporal localization of humans as well as relevant objects and
scene parts for a large number of actions and videos.</p>
        </li>
        <li id="uid23">
          <p noindent="true"><b>Weakly supervised learning.</b> We will collect weakly labeled
images and videos for training. The collection process will be
semi-automatic. We will use image or video search engines such as
Google Image Search, Flickr or YouTube to find visual data
corresponding to the labels. Initial datasets will be obtained by
manually correcting whole-image/video labels, i.e., the approach will
evaluate how well the object model can be learned if the entire image
or video is labeled, but the object model has to be extracted
automatically. Subsequent datasets will feature noisy and incorrect
labels. Testing will be performed on PASCAL VOC'07 and ImageNet, but
also on more realistic datasets similar to those used for training,
which we develop and manually annotate for evaluation. Our dataset
will include both images and videos, the categories represented will
include objects, scenes, and human activities, and the data will
be presented in realistic conditions.</p>
        </li>
        <li id="uid24">
          <p noindent="true"><b>Joint learning from visual information and text.</b> Initially, we
will use a selection from the large number of movies and TV series for
which scripts are available on-line, see for
example <ref xlink:href="http://www.dailyscript.com" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">http://<allowbreak/>www.<allowbreak/>dailyscript.<allowbreak/>com</ref> and
<ref xlink:href="http://www.weeklyscript.com" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">http://<allowbreak/>www.<allowbreak/>weeklyscript.<allowbreak/>com</ref>. These scripts can easily be aligned with
the videos by establishing correspondences between script words and
(timestamped) spoken ones obtained from the subtitles or audio track.
The goal is to jointly learn from visual content and text. To measure
the quality of such a joint learning, we will manually annotate some
of the videos. Annotations will include the space-time locations of
the actions as well as the correct parsing of the sentences. While DVDs
will initially receive most attention, we will also investigate the
use of data obtained from web pages, for example images with captions,
or images and videos surrounded by text. This data is by nature more
noisy than scripts.</p>
        </li>
      </simplelist>
    </subsection>
  </fondements>
  <domaine id="uid25">
    <bodyTitle>Application Domains</bodyTitle>
    <subsection id="uid26" level="1">
      <bodyTitle>Visual applications</bodyTitle>
      <p>Any solution to automatically understanding images and videos on a
semantic level will have an immediate impact on a wide range of
applications. For example:</p>
      <simplelist>
        <li id="uid27">
          <p noindent="true">Semantic-level image and video access is highly relevant for visual
search on the Web, in professional archives and personal
collections.</p>
        </li>
        <li id="uid28">
          <p noindent="true">Visual data organization is applicable to organizing family photo
and video albums as well as to large-scale information retrieval.</p>
        </li>
        <li id="uid29">
          <p noindent="true">Visual object recognition has potential applications ranging from
surveillance, service robotics for assistance in day-to-day
activities as well as the medical domain.</p>
        </li>
        <li id="uid30">
          <p noindent="true">Action recognition is highly relevant to visual surveillance,
assisted driving and video access.</p>
        </li>
        <li id="uid31">
          <p noindent="true">Real-time scene understanding is relevant for human interaction
through devices such as HoloLens, Oculus Rift.</p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid32" level="1">
      <bodyTitle>Pluri-disciplinary research</bodyTitle>
<p>Machine learning is intrinsically pluri-disciplinary. By developing large-scale
machine learning models and algorithms for processing data, the Thoth team
has naturally become involved in pluri-disciplinary collaborations that go beyond visual
modelling. In particular,</p>
      <simplelist>
        <li id="uid33">
          <p noindent="true">extensions of unsupervised learning techniques originally developed for modelling the statistics of natural images have been deployed in neuro-imaging for fMRI data with the collaboration of the Parietal team from Inria.</p>
        </li>
        <li id="uid34">
          <p noindent="true">similarly, deep convolutional data representations, also originally developed for visual data, have been successfully extended to the processing of biological sequences, with collaborators from bio-informatics.</p>
        </li>
        <li id="uid35">
          <p noindent="true">Thoth also collaborates with experts in natural language and text processing, for applications where visual modalities need to be combined with text data.</p>
        </li>
      </simplelist>
    </subsection>
  </domaine>
  <highlights id="uid36">
    <bodyTitle>Highlights of the Year</bodyTitle>
    <subsection id="uid37" level="1">
      <bodyTitle>Highlights of the Year</bodyTitle>
      <subsection id="uid38" level="2">
        <bodyTitle>Awards</bodyTitle>
        <simplelist>
          <li id="uid39">
            <p noindent="true">Cordelia Schmid received the Royal Society Milner Award, 2019.</p>
          </li>
          <li id="uid40">
            <p noindent="true">Julien Mairal received the test-of-time award at the International Conference on Machine Learning (ICML), 2019.</p>
          </li>
          <li id="uid41">
            <p noindent="true">The paper <ref xlink:href="#thoth-2019-bid0" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> authored by Roman Klokov, Jakob Verbeek, Edmond Boyer [Inria Morpheo] won the “Best Science Paper Award Honourable Mention” at BMVC 2019.</p>
          </li>
          <li id="uid42">
            <p noindent="true">Jakob Verbeek was awarded as an outstanding reviewer at ICLR 2019.</p>
          </li>
          <li id="uid43">
            <p noindent="true">Adria Ruiz Ovejero was awarded as an outstanding reviewer at ICCV 2019.</p>
          </li>
        </simplelist>
      </subsection>
      <subsection id="uid44" level="2">
        <bodyTitle>Dissemination</bodyTitle>
        <simplelist>
          <li id="uid45">
            <p noindent="true">The team co-organized PAISS 2019, an international AI summer school
in Paris. This is the second edition of the school that was first organized in
Grenoble in 2018. The 2019 edition brought together over 200 participants. We
also provided scholarships to 21 students to encourage diversity among the
attendees.</p>
          </li>
        </simplelist>
      </subsection>
    </subsection>
  </highlights>
  <logiciels id="uid46">
    <bodyTitle>New Software and Platforms</bodyTitle>
    <subsection id="uid47" level="1">
      <bodyTitle>LCR-Net</bodyTitle>
      <p>
        <i>Localization-Classification-Regression Network for Human Pose</i>
      </p>
      <p><span class="smallcap" align="left">Keywords:</span> Object detection - Recognition of human movement</p>
      <p><span class="smallcap" align="left">Functional Description:</span> We propose an end-to-end architecture for joint 2D and 3D human pose estimation in natural images. Key to our approach is the generation and scoring of a number of pose proposals per image, which allows us to predict 2D and 3D pose of multiple people simultaneously.
Our architecture contains 3 main components: 1) the pose proposal generator that suggests potential poses at different locations in the image, 2) a classifier that scores the different pose proposals , and 3) a regressor that refines pose proposals both in 2D and 3D.</p>
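<p>To make this three-stage flow concrete, here is a minimal PyTorch-style sketch; the module and parameter names (backbone, feat_dim, num_anchor_poses, num_joints) are illustrative assumptions and do not correspond to the released LCR-Net code.</p>
<pre>
# Hypothetical sketch of the proposal/classification/regression flow
# described above; shapes and names are assumptions, not the real code.
import torch
import torch.nn as nn

class LCRNetSketch(nn.Module):
    def __init__(self, backbone, feat_dim=2048, num_anchor_poses=20, num_joints=13):
        super().__init__()
        self.backbone = backbone  # shared convolutional features (assumption)
        # 2) classification branch: one score per anchor pose, plus background
        self.cls = nn.Linear(feat_dim, num_anchor_poses + 1)
        # 3) regression branch: per-anchor refinement of 2D (x, y) and 3D (x, y, z)
        self.reg = nn.Linear(feat_dim, num_anchor_poses * num_joints * 5)

    def forward(self, regions):
        # 1) pose proposals: anchor poses hypothesized at candidate image regions
        f = self.backbone(regions)   # (num_regions, feat_dim) pooled region features
        scores = self.cls(f)         # score every (region, anchor pose) pair
        deltas = self.reg(f)         # refine each proposal jointly in 2D and 3D
        return scores, deltas
</pre>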
      <simplelist>
        <li id="uid48">
          <p noindent="true">Participants: Grégory Rogez, Philippe Weinzaepfel and Cordelia Schmid</p>
        </li>
        <li id="uid49">
          <p noindent="true">Partner: Naver Labs Europe</p>
        </li>
        <li id="uid50">
          <p noindent="true">Contact: Nicolas Jourdan</p>
        </li>
        <li id="uid51">
          <p noindent="true">Publication: <ref xlink:href="https://hal.inria.fr/hal-01505085" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">LCR-Net: Localization-Classification-Regression for Human Pose</ref></p>
        </li>
        <li id="uid52">
          <p noindent="true">URL: <ref xlink:href="https://thoth.inrialpes.fr/src/LCR-Net/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>thoth.<allowbreak/>inrialpes.<allowbreak/>fr/<allowbreak/>src/<allowbreak/>LCR-Net/</ref></p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid53" level="1">
      <bodyTitle>CKN-seq</bodyTitle>
      <p>
        <i>Convolutional Kernel Networks for Biological Sequences</i>
      </p>
      <p><span class="smallcap" align="left">Keyword:</span> Bioinformatics</p>
      <p><span class="smallcap" align="left">Scientific Description:</span> The growing amount of biological sequences available makes it possible to learn genotype-phenotype relationships from data with increasingly high accuracy. By exploiting large sets of sequences with known phenotypes, machine learning methods can be used to build functions that predict the phenotype of new, unannotated sequences. In particular, deep neural networks have recently obtained good performances on such prediction tasks, but are notoriously difficult to analyze or interpret. Here, we introduce a hybrid approach between kernel methods and convolutional neural networks for sequences, which retains the ability of neural networks to learn good representations for a learning problem at hand, while defining a well characterized Hilbert space to describe prediction functions. Our method outperforms state-of-the-art convolutional neural networks on a transcription factor binding prediction task while being much faster to train and yielding more stable and interpretable results.</p>
      <p><span class="smallcap" align="left">Functional Description:</span> D. Chen, L. Jacob, and J. Mairal. Biological Sequence Modeling with Convolutional Kernel Networks. Bioinformatics, volume 35, issue 18, pages 3294-3302, 2019.</p>
      <simplelist>
        <li id="uid54">
          <p noindent="true">Participants: Laurent Jacob, Dexiong Chen and Julien Mairal</p>
        </li>
        <li id="uid55">
          <p noindent="true">Partners: CNRS - UGA</p>
        </li>
        <li id="uid56">
          <p noindent="true">Contact: Julien Mairal</p>
        </li>
        <li id="uid57">
          <p noindent="true">Publication: <ref xlink:href="https://hal.inria.fr/hal-01632912" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">Biological Sequence Modeling with Convolutional Kernel Networks</ref></p>
        </li>
        <li id="uid58">
          <p noindent="true">URL: <ref xlink:href="https://gitlab.inria.fr/dchen/CKN-seq" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>gitlab.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>dchen/<allowbreak/>CKN-seq</ref></p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid59" level="1">
      <bodyTitle>LVO</bodyTitle>
      <p>
        <i>Learning Video Object Segmentation with Visual Memory</i>
      </p>
      <p><span class="smallcap" align="left">Keyword:</span> Video analysis</p>
      <p><span class="smallcap" align="left">Functional Description:</span> This is a public implementation of the method described in the following paper: Learning Video Object Segmentation with Visual Memory [ICCV 2017] (https://hal.archives-ouvertes.fr/hal-01511145v2/document).</p>
      <p>This paper addresses the task of segmenting moving objects in unconstrained videos. We introduce a novel two-stream neural network with an explicit memory module to achieve this. The two streams of the network encode spatial and temporal features in a video sequence respectively, while the memory module captures the evolution of objects over time. The module to build a "visual memory" in video, i.e., a joint representation of all the video frames, is realized with a convolutional recurrent unit learned from a small number of training video sequences. Given a video frame as input, our approach assigns each pixel an object or background label based on the learned spatio-temporal features as well as the "visual memory" specific to the video, acquired automatically without any manually-annotated frames. The visual memory is implemented with convolutional gated recurrent units, which allow spatial information to be propagated over time. We evaluate our method extensively on two benchmarks, the DAVIS and Freiburg-Berkeley motion segmentation datasets, and show state-of-the-art results. For example, our approach outperforms the top method on the DAVIS dataset by nearly 6%. We also provide an extensive ablative analysis to investigate the influence of each component in the proposed framework.</p>
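      <p>As a sketch of the kind of recurrent unit used for the visual memory, the following minimal PyTorch convolutional GRU cell keeps a spatial hidden state and updates it frame by frame. This is an illustrative implementation of a generic ConvGRU, not the released LVO code; channel counts and kernel sizes are arbitrary.</p>
      <pre>
# A minimal convolutional GRU cell, sketching the kind of recurrent
# "visual memory" unit described above (hypothetical, not the LVO release).
import torch
import torch.nn as nn

class ConvGRUCell(nn.Module):
    def __init__(self, in_ch, hid_ch, k=3):
        super().__init__()
        pad = k // 2
        self.gates = nn.Conv2d(in_ch + hid_ch, 2 * hid_ch, k, padding=pad)  # update + reset
        self.cand = nn.Conv2d(in_ch + hid_ch, hid_ch, k, padding=pad)       # candidate state

    def forward(self, x, h):
        z, r = torch.sigmoid(self.gates(torch.cat([x, h], dim=1))).chunk(2, dim=1)
        h_tilde = torch.tanh(self.cand(torch.cat([x, r * h], dim=1)))
        return (1 - z) * h + z * h_tilde   # convex combination keeps the spatial layout

cell = ConvGRUCell(in_ch=64, hid_ch=32)
h = torch.zeros(1, 32, 56, 56)
for _ in range(5):                          # propagate the memory over 5 frames
    h = cell(torch.randn(1, 64, 56, 56), h)
      </pre>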
      <simplelist>
        <li id="uid60">
          <p noindent="true">Participants: Karteek Alahari, Cordelia Schmid and Pavel Tokmakov</p>
        </li>
        <li id="uid61">
          <p noindent="true">Contact: Pavel Tokmakov</p>
        </li>
        <li id="uid62">
          <p noindent="true">Publication: <ref xlink:href="https://hal.inria.fr/hal-01511145v2" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">hal-01511145v2</ref></p>
        </li>
        <li id="uid63">
          <p noindent="true">URL: <ref xlink:href="http://lear.inrialpes.fr/research/lvo/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">http://<allowbreak/>lear.<allowbreak/>inrialpes.<allowbreak/>fr/<allowbreak/>research/<allowbreak/>lvo/</ref></p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid64" level="1">
      <bodyTitle>SURREAL</bodyTitle>
      <p>
        <i>Learning from Synthetic Humans</i>
      </p>
      <p><span class="smallcap" align="left">Keywords:</span> Synthetic human - Segmentation - Neural networks</p>
      <p><span class="smallcap" align="left">Functional Description:</span> The SURREAL dataset consisting of synthetic videos of humans, and models trained on this dataset are released in this package. The code for rendering synthetic images of people and for training models is also included in the release.</p>
      <simplelist>
        <li id="uid65">
          <p noindent="true">Participants: Gül Varol, Xavier Martin, Ivan Laptev and Cordelia Schmid</p>
        </li>
        <li id="uid66">
          <p noindent="true">Contact: Gül Varol</p>
        </li>
        <li id="uid67">
          <p noindent="true">Publication: <ref xlink:href="https://hal.inria.fr/hal-01505711" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">Learning from Synthetic Humans</ref></p>
        </li>
        <li id="uid68">
          <p noindent="true">URL: <ref xlink:href="http://www.di.ens.fr/willow/research/surreal/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">http://<allowbreak/>www.<allowbreak/>di.<allowbreak/>ens.<allowbreak/>fr/<allowbreak/>willow/<allowbreak/>research/<allowbreak/>surreal/</ref></p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid69" level="1">
      <bodyTitle>attn2d</bodyTitle>
      <p>
        <i>Pervasive Attention</i>
      </p>
      <p><span class="smallcap" align="left">Keywords:</span> NLP - Deep learning - Machine translation</p>
      <p><span class="smallcap" align="left">Scientific Description:</span> Pervasive attention : 2D Convolutional Networks for Sequence-to-Sequence Prediction</p>
      <p><span class="smallcap" align="left">Functional Description:</span> An open source PyTorch implementation of the pervasive attention model described in:
Maha Elbayad, Laurent Besacier, and Jakob Verbeek. 2018. Pervasive Attention: 2D Convolutional Networks for Sequence-to-Sequence Prediction. In Proceedings of the 22nd Conference on Computational Natural Language Learning (CoNLL 2018)</p>
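      <p>A minimal, hypothetical sketch of the input representation behind pervasive attention: source and target token embeddings are broadcast onto a joint 2D grid, which a 2D convolutional network can then process. Tensor shapes and names are illustrative; the released implementation differs.</p>
      <pre>
# Sketch of the pervasive-attention joint grid (hypothetical code): source and
# target embeddings are concatenated channel-wise on a (target_len x source_len) grid.
import torch

def joint_grid(src_emb, tgt_emb):
    """src_emb: (S, d), tgt_emb: (T, d); returns (2d, T, S); add a batch dim for Conv2d."""
    S, d = src_emb.shape
    T, _ = tgt_emb.shape
    src = src_emb.t().unsqueeze(1).expand(d, T, S)   # copy source along the target axis
    tgt = tgt_emb.t().unsqueeze(2).expand(d, T, S)   # copy target along the source axis
    return torch.cat([src, tgt], dim=0)              # channel-wise concatenation

grid = joint_grid(torch.randn(7, 16), torch.randn(5, 16))
print(grid.shape)  # torch.Size([32, 5, 7])
      </pre>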
      <simplelist>
        <li id="uid70">
          <p noindent="true">Participants: Maha Elbayad and Jakob Verbeek</p>
        </li>
        <li id="uid71">
          <p noindent="true">Contact: Maha Elbayad</p>
        </li>
        <li id="uid72">
          <p noindent="true">Publication: <ref xlink:href="https://hal.inria.fr/hal-01851612" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">Pervasive Attention: 2D Convolutional Neural Networks for Sequence-to-Sequence Prediction</ref></p>
        </li>
        <li id="uid73">
          <p noindent="true">URL: <ref xlink:href="https://github.com/elbayadm/attn2d" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>github.<allowbreak/>com/<allowbreak/>elbayadm/<allowbreak/>attn2d</ref></p>
        </li>
      </simplelist>
    </subsection>
    <subsection id="uid74" level="1">
      <bodyTitle>Cyanure</bodyTitle>
      <p>
        <i>Cyanure: An Open-Source Toolbox for Empirical Risk Minimization</i>
      </p>
      <p><span class="smallcap" align="left">Keyword:</span> Machine learning</p>
      <p><span class="smallcap" align="left">Functional Description:</span> Cyanure is an open-source C++ software package with a Python interface. The goal of Arsenic is to provide state-of-the-art solvers for learning linear models, based on stochastic variance-reduced stochastic optimization with acceleration mechanisms and Quasi-Newton principles. Arsenic can handle a large variety of loss functions (logistic, square, squared hinge, multinomial logistic) and regularization functions (l2, l1, elastic-net, fused Lasso, multi-task group Lasso). It provides a simple Python API, which is very close to that of scikit-learn, which should be extended to other languages such as R or Matlab in a near future.</p>
      <p><span class="smallcap" align="left">Release Functional Description:</span> version initiale</p>
      <simplelist>
        <li id="uid75">
          <p noindent="true">Participant: Julien Mairal</p>
        </li>
        <li id="uid76">
          <p noindent="true">Contact: Julien Mairal</p>
        </li>
        <li id="uid77">
          <p noindent="true">URL: <ref xlink:href="http://thoth.inrialpes.fr/people/mairal/arsenic/welcome.html" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">http://<allowbreak/>thoth.<allowbreak/>inrialpes.<allowbreak/>fr/<allowbreak/>people/<allowbreak/>mairal/<allowbreak/>arsenic/<allowbreak/>welcome.<allowbreak/>html</ref></p>
        </li>
      </simplelist>
    </subsection>
  </logiciels>
  <resultats id="uid78">
    <bodyTitle>New Results</bodyTitle>
    <subsection id="uid79" level="1">
      <bodyTitle>Visual Recognition and Robotics</bodyTitle>
      <object id="uid80">
        <table>
          <tr>
            <td>
              <ressource xlink:href="IMG/aruiz1.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
            </td>
          </tr>
        </table>
        <caption>Illustration of different reference-based disentangling problems. (a) Disentangling style from digits. The reference distribution is composed of digits with a fixed style. (b) Disentangling factors of variation related to facial expressions. Reference images correspond to neutral faces. Note that pairing information between unlabelled and reference images is not available during training.</caption>
      </object>
      <subsection id="uid81" level="2">
        <bodyTitle>Learning Disentangled Representations with Reference-Based Variational Autoencoders</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Adria</firstname>
            <lastname>Ruiz</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Oriol</firstname>
            <lastname>Martinez</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Xavier</firstname>
            <lastname>Binefa</lastname>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
        </participants>
        <p>Learning disentangled representations from visual data, where different high-level generative factors are independently encoded, is of importance for many computer vision tasks. Supervised approaches, however, require a significant annotation effort in order to label the factors of interest in a training set. To alleviate the annotation cost, in <ref xlink:href="#thoth-2019-bid1" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> we introduce a learning setting which we refer to as “reference-based disentangling”. Given a pool of unlabelled images, the goal is to learn a representation where a set of target factors are disentangled from others.
The only supervision comes from an auxiliary “reference set” that contains images where the factors of interest are constant. See Fig. <ref xlink:href="#uid80" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> for illustrative examples. In order to address this problem, we propose reference-based variational autoencoders, a novel deep generative model designed to exploit the weak supervisory signal provided by the reference set. During training, we use the variational inference framework where adversarial learning is used to minimize the objective function. By addressing tasks such as feature learning, conditional image generation or attribute transfer, we validate the ability of the proposed model to learn disentangled representations from minimal supervision.</p>
      </subsection>
      <subsection id="uid82" level="2">
        <bodyTitle>Tensor Decomposition and Non-linear Manifold Modeling for 3D Head Pose Estimation</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Dmytro</firstname>
            <lastname>Derkach</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Adria</firstname>
            <lastname>Ruiz</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Federico M.</firstname>
            <lastname>Sukno</lastname>
          </person>
        </participants>
        <p>Head pose estimation is a challenging computer vision problem with important applications in different scenarios such as human-computer interaction or face recognition. In <ref xlink:href="#thoth-2019-bid2" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we present a 3D head pose estimation algorithm based on non-linear manifold learning. A key feature of the proposed approach is that it allows modeling the underlying 3D manifold that results from the combination of rotation angles. To do so, we use tensor decomposition to generate separate subspaces for each variation factor and show that each of them has a clear structure that can be modeled with cosine functions from a unique shared parameter per angle (see Fig. <ref xlink:href="#uid83" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>). Such a representation provides a deep understanding of data behavior. We show that the proposed framework can be applied to a wide variety of input features and can be used for different purposes. First, we test our system on a publicly available database of 2D images and show that the cosine functions can be used to synthesize rotated versions of an object from which we see only a 2D image at a specific angle. Further, we perform 3D head pose estimation experiments using two other types of features: automatic landmarks and histogram-based 3D descriptors. We evaluate our approach on two publicly available databases, and demonstrate that angle estimations can be performed by optimizing the combination of these cosine functions to achieve state-of-the-art performance.</p>
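        <p>The cosine parameterization can be illustrated with a small, hypothetical NumPy example: each subspace coefficient is modeled as a cosine of a shared angle with a per-coefficient amplitude and phase, and the angle of a new sample is recovered by a grid search over the fit. The amplitudes and phases below are made-up stand-ins for values learned offline.</p>
        <pre>
# Illustrative sketch (hypothetical) of the cosine model for one rotation axis:
# subspace coefficients follow c_j(theta) = a_j * cos(theta + phi_j).
import numpy as np

a = np.array([1.0, 0.7, 0.4])          # per-coefficient amplitudes (stand-ins)
phi = np.array([0.0, 1.1, 2.3])        # per-coefficient phase shifts (stand-ins)

def coeffs(theta):
    return a * np.cos(theta + phi)     # subspace coefficients at rotation angle theta

def estimate_angle(observed, grid=np.linspace(-np.pi, np.pi, 3601)):
    errs = [np.sum((coeffs(t) - observed) ** 2) for t in grid]
    return grid[int(np.argmin(errs))]  # angle that best explains the coefficients

theta_true = 0.6
print(estimate_angle(coeffs(theta_true)))   # recovers ~0.6
        </pre>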
        <object id="uid83">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/aruiz2.png" type="figure" width="256.2026pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Visualization of the first three coefficients of the pose variation subspace for a dataset of a single object rotated about the vertical axis.</caption>
        </object>
      </subsection>
      <subsection id="uid84" level="2">
        <bodyTitle>Spreading vectors for similarity search</bodyTitle>
        <participants>
          <person key="thoth-2018-idp169600">
            <firstname>Alexandre</firstname>
            <lastname>Sablayrolles</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Matthijs</firstname>
            <lastname>Douze</lastname>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Hervé</firstname>
            <lastname>Jégou</lastname>
          </person>
        </participants>
        <p>Discretizing multi-dimensional data distributions is a fundamental step of modern indexing methods. State-of-the-art techniques learn parameters of quantizers on training data for optimal performance, thus adapting quantizers to the data.
In this work <ref xlink:href="#thoth-2019-bid3" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we propose to reverse this paradigm and adapt the data to the quantizer: we train a neural net whose last layer forms a fixed parameter-free quantizer, such as pre-defined points on a hyper-sphere.
As a proxy objective, we design and train a neural network that favors uniformity in the spherical latent space, while preserving the neighborhood structure after the mapping.
We propose a new regularizer derived from the Kozachenko–Leonenko differential entropy estimator to enforce uniformity and combine it with a locality-aware triplet loss.
Experiments show that our end-to-end approach outperforms most learned quantization methods, and is competitive with the state of the art on widely adopted benchmarks.
Furthermore, we show that training without the quantization step results in almost no difference in accuracy, but yields a generic catalyzer <ref xlink:href="#uid85" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> that can be applied with any subsequent quantizer.
The code is available online.</p>
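        <p>The uniformity-inducing regularizer can be sketched in a few lines of PyTorch: the Kozachenko–Leonenko estimator rewards large nearest-neighbor distances, so taking the negative mean log nearest-neighbor distance as a penalty spreads the mapped points over the sphere. This is an illustrative sketch, not the released code.</p>
        <pre>
# Sketch (hypothetical) of the entropy-inspired uniformity regularizer derived
# from the Kozachenko-Leonenko estimator: minimizing it pushes points apart.
import torch

def koleo_regularizer(z, eps=1e-8):
    """z: (n, d) batch of l2-normalized embeddings."""
    dists = torch.cdist(z, z)                  # pairwise Euclidean distances
    dists.fill_diagonal_(float('inf'))         # ignore self-distances
    nn_dist, _ = dists.min(dim=1)              # nearest-neighbor distance per point
    return -torch.log(nn_dist + eps).mean()    # negative KoLeo term, to be minimized

z = torch.nn.functional.normalize(torch.randn(128, 8), dim=1)
print(koleo_regularizer(z))   # combined in training with a locality-aware triplet loss
        </pre>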
        <object id="uid85">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/asablayr2.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Our method learns a network that encodes the input space <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msup><mi>ℝ</mi><mi>d</mi></msup></math></formula> into a code <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mi>c</mi><mo>(</mo><mi>x</mi><mo>)</mo></mrow></math></formula>.
It is learned end-to-end, yet the part of the network in charge of the discretization operation is fixed in advance, thereby avoiding optimization problems.
The learnable function f, namely the “catalyzer”, is optimized to increase the quality of the subsequent coding stage.</caption>
        </object>
      </subsection>
      <subsection id="uid86" level="2">
        <bodyTitle>Diversity with Cooperation: Ensemble Methods for Few-Shot Classification</bodyTitle>
        <participants>
          <person key="thoth-2018-idp142832">
            <firstname>Nikita</firstname>
            <lastname>Dvornik</lastname>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>Few-shot classification consists of learning a predictive model that is able to
effectively adapt to a new class, given only a few annotated samples. To solve
this challenging problem, meta-learning has become a popular paradigm that
advocates the ability to “learn to adapt”. Recent works have shown, however,
that simple learning strategies without meta-learning could be competitive. In
our ICCV'19 paper <ref xlink:href="#thoth-2019-bid4" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we go a step further and show
that by addressing the fundamental high-variance issue of few-shot learning
classifiers, it is possible to significantly outperform current meta-learning
techniques. Our approach consists of designing an ensemble of deep networks to
leverage the variance of the classifiers, and introducing new strategies to
encourage the networks to cooperate, while encouraging prediction diversity, as
illustrated in Figure <ref xlink:href="#uid87" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. Evaluation is conducted on the
mini-ImageNet and CUB datasets, where we show that even a single network
obtained by distillation yields state-of-the-art results.</p>
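        <p>A hypothetical sketch of the cooperation and diversity penalties on two networks' outputs: the ground-truth probability is masked out, the remaining distribution is renormalized, and the two networks are either pulled together (a KL term) or pushed towards orthogonality (an inner product). The exact losses used in the paper may differ.</p>
        <pre>
# Hypothetical sketch of the cooperation / diversity penalties on the softmax
# outputs of two networks (illustrative, not the paper's exact losses).
import torch
import torch.nn.functional as F

def nongt_distribution(logits, target):
    p = F.softmax(logits, dim=1).clone()
    p.scatter_(1, target.unsqueeze(1), 0.0)        # zero out the ground-truth class
    return p / p.sum(dim=1, keepdim=True)          # renormalize over the rest

def pairwise_term(logits_a, logits_b, target, mode, eps=1e-8):
    pa = nongt_distribution(logits_a, target)
    pb = nongt_distribution(logits_b, target)
    if mode == 'cooperation':                      # make non-GT parts similar (KL)
        return (pa * (pa.clamp_min(eps).log() - pb.clamp_min(eps).log())).sum(1).mean()
    return (pa * pb).sum(dim=1).mean()             # diversity: push towards orthogonality

target = torch.tensor([2, 0])
la, lb = torch.randn(2, 5), torch.randn(2, 5)
print(pairwise_term(la, lb, target, 'cooperation'),
      pairwise_term(la, lb, target, 'diversity'))
        </pre>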
        <object id="uid87">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/dvornik1.png" type="figure" width="422.73235pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption><b>Illustration of the cooperation and diversity strategies on
two networks.</b> All networks receive the same image as input and compute
corresponding class probabilities with softmax. Cooperation encourages the
non-ground truth probabilities (in red) to be similar, after normalization,
whereas diversity encourages orthogonality.</caption>
        </object>
      </subsection>
      <subsection id="uid88" level="2">
        <bodyTitle>Unsupervised Pre-Training of Image Features on Non-Curated Data</bodyTitle>
        <participants>
          <person key="thoth-2018-idp135488">
            <firstname>Mathilde</firstname>
            <lastname>Caron</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Piotr</firstname>
            <lastname>Bojanowski</lastname>
            <moreinfo>Facebook AI</moreinfo>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Armand</firstname>
            <lastname>Joulin</lastname>
            <moreinfo>Facebook AI</moreinfo>
          </person>
        </participants>
        <p>Pre-training general-purpose visual features with convolutional neural networks without relying on annotations is a challenging and important task.
Most recent efforts in unsupervised feature learning have focused on either small or highly curated datasets like ImageNet, whereas using non-curated raw datasets was found to decrease the feature quality when evaluated on a transfer task.
Our goal is to bridge the performance gap between unsupervised methods trained on curated data, which are costly to obtain, and massive raw datasets that are easily available.
To this end, we propose a new unsupervised approach, DeeperCluster <ref xlink:href="#thoth-2019-bid5" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, described in Figure <ref xlink:href="#uid89" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, which leverages self-supervision and clustering to capture complementary statistics from large-scale data.
We validate our approach on 96 million images from YFCC100M, achieving state-of-the-art results among unsupervised methods on standard benchmarks, which confirms the potential of unsupervised learning when only non-curated raw data are available.
We also show that pre-training a supervised VGG-16 with our method achieves <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mn>74</mn><mo>.</mo><mn>9</mn><mo>%</mo></mrow></math></formula> top-1 classification accuracy on the validation set of ImageNet, which is an improvement of <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mo>+</mo><mn>0</mn><mo>.</mo><mn>8</mn><mo>%</mo></mrow></math></formula> over the same network trained from scratch.</p>
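        <p>The single hierarchical loss can be sketched as follows (hypothetical code): a first softmax predicts the rotation applied to the input, and a rotation-specific head predicts the cluster assignment within that sub-hierarchy; all dimensions are illustrative.</p>
        <pre>
# Hypothetical sketch of the two-level hierarchical target used by DeeperCluster:
# level 1 predicts the rotation, level 2 the cluster within that rotation's head.
import torch
import torch.nn.functional as F

def hierarchical_loss(rot_logits, cluster_logits, rot_target, cluster_target):
    """rot_logits: (B, 4); cluster_logits: (B, 4, K), one cluster head per rotation."""
    loss_rot = F.cross_entropy(rot_logits, rot_target)
    idx = rot_target.view(-1, 1, 1).expand(-1, 1, cluster_logits.size(2))
    per_rot = cluster_logits.gather(1, idx).squeeze(1)   # pick the matching head
    loss_cluster = F.cross_entropy(per_rot, cluster_target)
    return loss_rot + loss_cluster

B, K = 8, 10
loss = hierarchical_loss(torch.randn(B, 4), torch.randn(B, 4, K),
                         torch.randint(0, 4, (B,)), torch.randint(0, K, (B,)))
print(loss)
        </pre>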
        <object id="uid89">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/mathilde1.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>DeeperCluster alternates between a hierarchical clustering of the features and learning the parameters of a convnet by predicting both the rotation angle and the cluster assignments in a single hierarchical loss.</caption>
        </object>
      </subsection>
      <subsection id="uid90" level="2">
        <bodyTitle>Learning to Augment Synthetic Images for Sim2Real Policy Transfer</bodyTitle>
        <participants>
          <person key="thoth-2018-idp167168">
            <firstname>Alexander</firstname>
            <lastname>Pashevich</lastname>
          </person>
          <person key="willow-2018-idp154880">
            <firstname>Robin</firstname>
            <lastname>Strudel</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="willow-2018-idp174496">
            <firstname>Igor</firstname>
            <lastname>Kalevatykh</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Ivan</firstname>
            <lastname>Laptev</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
        </participants>
        <p>Vision and learning have made significant progress that could improve robotics policies for complex tasks and environments. Learning deep neural networks for image understanding, however, requires large amounts of domain-specific visual data. While collecting such data from real robots is possible, such an approach limits scalability, as learning a policy typically requires thousands of trials. In this work <ref xlink:href="#thoth-2019-bid6" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> we attempt to learn manipulation policies in simulated environments. Simulators enable scalability and provide access to the underlying world state during training. Policies learned in simulators, however, do not transfer well to real scenes given the domain gap between real and synthetic data. We follow recent work on domain randomization and augment synthetic images with sequences of random transformations. Our main contribution is to optimize the augmentation strategy for sim2real transfer and to enable domain-independent policy learning, as illustrated in Figure <ref xlink:href="#uid91" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. We design an efficient search for depth image augmentations using object localization as a proxy task. Given the resulting sequence of random transformations, we use it to augment synthetic depth images during policy learning. Our augmentation strategy is policy-independent and enables policy learning with no real images. We demonstrate that our approach significantly improves accuracy on three manipulation tasks evaluated on a real robot.</p>
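        <p>Conceptually, the searched augmentation is an ordered list of simple random transformations applied to synthetic depth images; the NumPy sketch below chains two such primitives, a random cutout and additive depth noise. The primitives and parameters are illustrative, not the ones found by our search.</p>
        <pre>
# Hypothetical sketch of applying a searched sequence of random transformations
# to synthetic depth images, in the spirit of the augmentation strategy above.
import numpy as np

def cutout(img, rng, size=8):
    h, w = img.shape
    y, x = rng.integers(0, h - size), rng.integers(0, w - size)
    out = img.copy()
    out[y:y + size, x:x + size] = 0.0          # drop a random patch
    return out

def depth_noise(img, rng, scale=0.01):
    return img + scale * rng.standard_normal(img.shape)

augmentation_sequence = [cutout, depth_noise]  # the "searched" ordered list

rng = np.random.default_rng(0)
depth = np.ones((64, 64), dtype=np.float32)    # stand-in synthetic depth image
for transform in augmentation_sequence:
    depth = transform(depth, rng)
        </pre>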
        <object id="uid91">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/pashevich1.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Overview of the method. Our contribution is the policy-independent learning of depth image augmentations (left). The resulting sequence of augmentations is applied to synthetic depth images while learning manipulation policies in a simulator (middle). The learned policies are directly applied to real robot scenes without finetuning on real images.</caption>
        </object>
      </subsection>
      <subsection id="uid92" level="2">
        <bodyTitle>Learning to combine primitive skills: A step towards versatile robotic manipulation</bodyTitle>
        <participants>
          <person key="willow-2018-idp154880">
            <firstname>Robin</firstname>
            <lastname>Strudel</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="thoth-2018-idp167168">
            <firstname>Alexander</firstname>
            <lastname>Pashevich</lastname>
          </person>
          <person key="willow-2018-idp174496">
            <firstname>Igor</firstname>
            <lastname>Kalevatykh</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Ivan</firstname>
            <lastname>Laptev</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Josef</firstname>
            <lastname>Sivic</lastname>
            <moreinfo>Inria WILLOW</moreinfo>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
        </participants>
        <p>Manipulation tasks such as preparing a meal or assembling furniture remain highly challenging for robotics and vision. Traditional task and motion planning (TAMP) methods can solve complex tasks but require full state observability and are not adapted to dynamic scene changes. Recent learning methods can operate directly on visual inputs but typically require many demonstrations and/or task-specific reward engineering. In this work <ref xlink:href="#thoth-2019-bid7" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> we aim to overcome previous limitations and propose a reinforcement learning (RL) approach to task planning that learns to combine primitive skills illustrated in Figure <ref xlink:href="#uid93" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. First, compared to previous learning methods, our approach requires neither intermediate rewards nor complete task demonstrations during training. Second, we demonstrate the versatility of our vision-based task planning in challenging settings with temporary occlusions and dynamic scene changes. Third, we propose an efficient training of basic skills from few synthetic demonstrations by exploring recent CNN architectures and data augmentation. Notably, while all of our policies are learned on visual inputs in simulated environments, we demonstrate the successful transfer and high success rates when applying such policies to manipulation tasks on a real UR5 robotic arm.</p>
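        <p>The temporal hierarchy can be summarized by a short, hypothetical control loop: every n time-steps the master policy selects one of the K skill policies, which then emits low-level controls until the master acts again. The policies below are random stand-ins for the learned CNNs, and the environment step is omitted.</p>
        <pre>
# Hypothetical sketch of the master / skill temporal hierarchy described above.
import numpy as np

rng = np.random.default_rng(0)
K, n, horizon = 3, 10, 50

def master_policy(obs):
    return rng.integers(0, K)          # stand-in for the learned master CNN

def skill_policy(k, obs):
    return rng.standard_normal(7)      # stand-in low-level control for skill k

obs = np.zeros(16)
for t in range(horizon):
    if t % n == 0:                     # the master acts at a coarse time scale
        active_skill = master_policy(obs)
    action = skill_policy(active_skill, obs)
    # obs = env.step(action)           # environment step omitted in this sketch
        </pre>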
        <object id="uid93">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/pashevich2.png" type="figure" width="427.0pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Illustration of our approach. (Left): Temporal hierarchy of master and skill policies. The master policy <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>π</mi><mi>m</mi></msub></math></formula> is executed at a coarse interval of <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>n</mi></math></formula> time-steps to select among <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>K</mi></math></formula> skill policies <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><msubsup><mi>π</mi><mi>s</mi><mn>1</mn></msubsup><mo>...</mo><msubsup><mi>π</mi><mi>s</mi><mi>K</mi></msubsup></mrow></math></formula>. Each skill policy generates control for a primitive action such as <i>grasping</i> or <i>pouring</i>. (Right): CNN architecture used for the skill and master policies.</caption>
        </object>
      </subsection>
      <subsection id="uid94" level="2">
        <bodyTitle>Probabilistic Reconstruction Networks for 3D Shape Inference from a Single Image</bodyTitle>
        <participants>
          <person key="thoth-2018-idp154992">
            <firstname>Roman</firstname>
            <lastname>Klokov</lastname>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Edmond</firstname>
            <lastname>Boyer</lastname>
            <moreinfo>Inria Morpheo</moreinfo>
          </person>
        </participants>
        <p>In our BMVC'19 paper <ref xlink:href="#thoth-2019-bid0" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we study end-to-end learning strategies for 3D shape inference from images, in particular from a single image. Several approaches in this direction have been investigated that explore different shape representations and suitable learning architectures. We focus instead on the underlying probabilistic mechanisms involved and contribute a more principled probabilistic inference-based reconstruction framework, which we coin Probabilistic Reconstruction Networks. This framework expresses image conditioned 3D shape inference through a family of latent variable models, and naturally decouples the choice of shape representations from the inference itself. Moreover, it suggests different options for the image conditioning and allows training in two regimes, using either Monte Carlo or variational approximation of the marginal likelihood. Using our Probabilistic Reconstruction Networks we obtain single image 3D reconstruction results that set a new state of the art on the ShapeNet dataset in terms of the intersection over union and earth mover's distance evaluation metrics. Interestingly, we obtain these results using a basic voxel grid representation, improving over recent work based on finer point cloud or mesh based representations. In Figure <ref xlink:href="#uid95" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> we show a schematic overview of our model.</p>
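        <p>In the variational training regime, the objective is an evidence lower bound on the likelihood of the shape given the image; the PyTorch sketch below computes it for a diagonal Gaussian posterior against a unit Gaussian prior, which is one common instantiation and is illustrative rather than the exact training code.</p>
        <pre>
# Hypothetical sketch of the variational objective: reconstruction term minus
# the KL divergence between the approximate posterior and the prior.
import torch

def elbo(recon_logprob, mu, logvar):
    """recon_logprob: log p(shape | z, image); (mu, logvar): posterior q(z | ...)."""
    # KL( N(mu, diag(exp(logvar))) || N(0, I) ), summed over latent dimensions
    kl = 0.5 * torch.sum(mu ** 2 + logvar.exp() - logvar - 1.0, dim=1)
    return (recon_logprob - kl).mean()   # maximize; negate to obtain a loss

mu, logvar = torch.zeros(4, 32), torch.zeros(4, 32)
print(elbo(torch.full((4,), -100.0), mu, logvar))   # KL is 0 at the prior
        </pre>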
        <object id="uid95">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/rklokov1.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Probabilistic Reconstruction Networks for 3D shape inference from a single image. Arrows show the computational flow through the model, dotted arrows show optional image conditioning. Conditioning between 2D and 3D tensors is achieved by means of FiLM layers. The inference network <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>q</mi><mi>ψ</mi></msub></math></formula> is only used during training for variational inference.</caption>
        </object>
      </subsection>
      <subsection id="uid96" level="2">
        <bodyTitle>Hierarchical Scene Coordinate Classification and Regression for Visual Localization</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Xiaotian</firstname>
            <lastname>Li</lastname>
            <moreinfo>Aalto Univ., Finland</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Shuzhe</firstname>
            <lastname>Wang</lastname>
            <moreinfo>Aalto Univ., Finland</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Li</firstname>
            <lastname>Zhao</lastname>
            <moreinfo>Aalto Univ., Finland</moreinfo>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Juho</firstname>
            <lastname>Kannala</lastname>
            <moreinfo>Aalto Univ., Finland</moreinfo>
          </person>
        </participants>
        <p>Visual localization is critical to many applications in computer vision and robotics. To address single-image RGB localization, state-of-the-art feature-based methods match local descriptors between a query image and a pre-built 3D model. Recently, deep neural networks have been exploited to regress the mapping between raw pixels and 3D coordinates in the scene, and thus the matching is implicitly performed by the forward pass through the network. However, in a large and ambiguous environment, learning such a regression task directly can be difficult for a single network. In our paper <ref xlink:href="#thoth-2019-bid8" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we present a new hierarchical scene coordinate network to predict pixel scene coordinates in a coarse-to-fine manner from a single RGB image. The network consists of a series of output layers, with each of them conditioned on the previous ones. The final output layer predicts the 3D coordinates and the others produce progressively finer discrete location labels. The proposed method outperforms the baseline regression-only network and allows us to train single compact models which scale robustly to large environments. It sets a new state of the art for single-image RGB localization on the 7-Scenes, 12-Scenes, and Cambridge Landmarks datasets, as well as three combined scenes. Moreover, for large-scale outdoor localization on the Aachen Day-Night dataset, our approach is much more accurate than existing scene coordinate regression approaches, and significantly reduces the performance gap w.r.t. explicit feature matching approaches.
In Figure <ref xlink:href="#uid97" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> we illustrate the scene coordinate predictions for the Aachen dataset experiments.</p>
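        <p>The coarse-to-fine conditioning can be sketched with a few 1x1 convolutions (hypothetical code): each output layer receives the features together with the predictions of the previous levels, with discrete location labels first and 3D coordinates last; the layer sizes are illustrative.</p>
        <pre>
# Hypothetical sketch of the hierarchical output head: coarse labels condition
# fine labels, which condition the final per-pixel 3D coordinate regression.
import torch
import torch.nn as nn

class HierarchicalHead(nn.Module):
    def __init__(self, feat_dim=128, coarse_bins=25, fine_bins=25):
        super().__init__()
        self.coarse = nn.Conv2d(feat_dim, coarse_bins, 1)
        self.fine = nn.Conv2d(feat_dim + coarse_bins, fine_bins, 1)
        self.coords = nn.Conv2d(feat_dim + coarse_bins + fine_bins, 3, 1)

    def forward(self, feat):
        c = self.coarse(feat).softmax(dim=1)                 # coarse region label
        f = self.fine(torch.cat([feat, c], dim=1)).softmax(dim=1)
        xyz = self.coords(torch.cat([feat, c, f], dim=1))    # final 3D coordinates
        return c, f, xyz

head = HierarchicalHead()
c, f, xyz = head(torch.randn(1, 128, 60, 80))
print(xyz.shape)   # per-pixel scene coordinates: (1, 3, 60, 80)
        </pre>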
        <object id="uid97">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/verbeek1.png" type="inline" width="192.1487pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
              <td>
                <ressource xlink:href="IMG/verbeek2.png" type="inline" width="192.1487pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>The scene coordinate predictions are visualized as 2D-2D matches between the query (left) and database (right) images. For each pair, the retrieved database image with the largest number of inliers is selected, and only the inlier matches are visualized. We show that our method is able to produce accurate correspondences for challenging queries.</caption>
        </object>
      </subsection>
      <subsection id="uid98" level="2">
        <bodyTitle>Moulding Humans: Non-parametric 3D Human Shape Estimation from Single Images</bodyTitle>
        <participants>
          <person key="thoth-2018-idp147696">
            <firstname>Valentin</firstname>
            <lastname>Gabeur</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Jean-Sébastien</firstname>
            <lastname>Franco</lastname>
            <moreinfo>Inria Morpheo</moreinfo>
          </person>
          <person key="thoth-2018-idp191568">
            <firstname>Xavier</firstname>
            <lastname>Martin</lastname>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
          <person key="thoth-2018-idp117488">
            <firstname>Gregory</firstname>
            <lastname>Rogez</lastname>
            <moreinfo>NAVER LABS Europe</moreinfo>
          </person>
        </participants>
        <p>While the recent progress in convolutional neural networks has allowed impressive results for 3D human pose estimation, estimating the full 3D shape of a person is still an open issue. Model-based approaches can output precise meshes of naked under-cloth human bodies but fail to estimate details and un-modelled elements such as hair or clothing. On the other hand, non-parametric volumetric approaches can potentially estimate complete shapes but, in practice, they are limited by the resolution of the output grid and cannot produce detailed estimates. In this paper <ref xlink:href="#thoth-2019-bid9" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we propose a non-parametric approach that employs a double depth map <ref xlink:href="#uid99" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> to represent the 3D shape of a person: a visible depth map and a “hidden” depth map are estimated and combined, to reconstruct the human 3D shape as done with a “mould”. This representation through 2D depth maps allows a higher resolution output with a much lower dimension than voxel-based volumetric representations.</p>
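        <p>The "mould" representation is easy to illustrate: given the two estimated depth maps and camera intrinsics, back-projecting each map and stacking the results yields the full-body point cloud passed to Poisson reconstruction. The NumPy sketch below uses made-up intrinsics and constant depth maps for brevity.</p>
        <pre>
# Hypothetical sketch of combining the "visible" and "hidden" depth maps into
# a single full-body point cloud (intrinsics and depths are stand-ins).
import numpy as np

def backproject(depth, fx=500.0, fy=500.0, cx=32.0, cy=32.0):
    h, w = depth.shape
    v, u = np.mgrid[0:h, 0:w]              # pixel coordinates
    x = (u - cx) * depth / fx
    y = (v - cy) * depth / fy
    return np.stack([x, y, depth], axis=-1).reshape(-1, 3)

visible = np.full((64, 64), 2.0)           # stand-ins for the two estimated maps
hidden = np.full((64, 64), 2.4)
cloud = np.concatenate([backproject(visible), backproject(hidden)], axis=0)
print(cloud.shape)                         # (8192, 3) full-body point cloud
        </pre>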
        <object id="uid99">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/vgabeur1.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Given a single image, we estimate the “visible” and the “hidden” depth maps. The 3D point clouds of these 2 depth maps are combined to form a full-body 3D point cloud, as if lining up the 2 halves of a “mould”. The 3D shape is then reconstructed using Poisson reconstruction. An adversarial training with a discriminator is employed to increase the humanness of the estimation.</caption>
        </object>
      </subsection>
      <subsection id="uid100" level="2">
        <bodyTitle>Focused Attention for Action Recognition</bodyTitle>
        <participants>
          <person key="thoth-2018-idp174464">
            <firstname>Vladyslav</firstname>
            <lastname>Sydorov</lastname>
          </person>
          <person key="thoth-2018-idp115024">
            <firstname>Karteek</firstname>
            <lastname>Alahari</lastname>
          </person>
        </participants>
        <p>In this paper <ref xlink:href="#thoth-2019-bid10" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we introduce an attention model for video action recognition that allows processing video at a higher resolution, by focusing on the relevant regions first. The network-specific saliency is used to guide the cropping; we illustrate the procedure in Figure <ref xlink:href="#uid101" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. We show performance improvement on the Charades dataset with this strategy.</p>
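        <p>The cropping step amounts to choosing the fixed-size window with the highest total saliency; the exhaustive NumPy sketch below illustrates this selection (the actual implementation may use a more efficient search).</p>
        <pre>
# Hypothetical sketch of saliency-guided cropping: slide a fixed-size window
# over the saliency map and keep the crop with the highest total saliency.
import numpy as np

def best_crop(saliency, crop=7):
    h, w = saliency.shape
    best, best_yx = -np.inf, (0, 0)
    for y in range(h - crop + 1):
        for x in range(w - crop + 1):
            score = saliency[y:y + crop, x:x + crop].sum()
            if score > best:
                best, best_yx = score, (y, x)
    return best_yx

saliency = np.zeros((14, 14))
saliency[3:8, 9:14] = 1.0            # a salient object region
print(best_crop(saliency))           # top-left corner of the selected window
        </pre>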
        <object id="uid101">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/vsydorov1.jpg" type="inline" width="149.4526pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
              <td>
                <ressource xlink:href="IMG/vsydorov2.jpg" type="inline" width="149.4526pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Example of attention on Charades action recognition dataset. (Left) Saliency scores (displayed as a heatmap) are localized around the object, a box maximizing the saliency measure within is selected. (Right) The network is provided with the relevant crop of the video, and can process it at a higher resolution.</caption>
        </object>
      </subsection>
    </subsection>
    <subsection id="uid102" level="1">
      <bodyTitle>Statistical Machine Learning</bodyTitle>
      <subsection id="uid103" level="2">
        <bodyTitle>A Contextual Bandit Bake-off</bodyTitle>
        <participants>
          <person key="thoth-2018-idp133056">
            <firstname>Alberto</firstname>
            <lastname>Bietti</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Alekh</firstname>
            <lastname>Agarwal</lastname>
            <moreinfo>Microsoft Research</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>John</firstname>
            <lastname>Langford</lastname>
            <moreinfo>Microsoft Research</moreinfo>
          </person>
        </participants>
        <p>Contextual bandit algorithms are essential for solving many real-world
interactive machine learning problems. Despite multiple recent successes
in developing statistically and computationally efficient methods, the practical
behavior of these algorithms is still poorly understood.
In this work, we leverage the availability of large numbers of supervised learning datasets to
compare and empirically optimize contextual bandit algorithms,
focusing on practical methods that learn by relying on optimization oracles
from supervised learning. We find that a recent method using optimism under uncertainty works the best overall.
A surprisingly close second is a simple greedy baseline that only explores implicitly through the diversity of contexts, followed by a variant of Online Cover which tends to be more conservative but robust to problem specification by design. Along the way, we also evaluate and improve several internal components of contextual bandit algorithm design. Overall, this is a thorough study and review of contextual bandit methodology.</p>
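        <p>The greedy baseline mentioned above is simple to sketch: per-action ridge-regression estimates are updated online, and the action with the highest predicted reward is always taken, so exploration happens only through the diversity of contexts. The simulation below is a hypothetical illustration, not the evaluation setup of the paper.</p>
        <pre>
# Hypothetical sketch of the greedy contextual bandit baseline: act with the
# current per-action linear model, with no explicit exploration.
import numpy as np

rng = np.random.default_rng(0)
d, K, T = 5, 3, 2000
theta = rng.standard_normal((K, d))                  # hidden reward weights per action

A = [np.eye(d) for _ in range(K)]                    # ridge statistics per action
b = [np.zeros(d) for _ in range(K)]
correct = 0
for t in range(T):
    x = rng.standard_normal(d)                       # observed context
    scores = [np.linalg.solve(A[k], b[k]) @ x for k in range(K)]
    k = int(np.argmax(scores))                       # greedy action choice
    r = theta[k] @ x + 0.1 * rng.standard_normal()   # observed noisy reward
    A[k] += np.outer(x, x)
    b[k] += r * x
    correct += int(k == int(np.argmax(theta @ x)))
print(correct / T)                                   # fraction of optimal pulls
        </pre>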
      </subsection>
      <subsection id="uid104" level="2">
        <bodyTitle>A Generic Acceleration Framework for Stochastic Composite Optimization</bodyTitle>
        <participants>
          <person key="thoth-2018-idp157424">
            <firstname>Andrei</firstname>
            <lastname>Kulunchakov</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>In <ref xlink:href="#thoth-2019-bid11" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we introduce various mechanisms to obtain accelerated first-order stochastic optimization algorithms when the objective function is convex or strongly convex. Specifically, we extend the Catalyst approach originally designed for deterministic objectives to the stochastic setting. Given an optimization method with mild convergence guarantees for strongly convex problems, the challenge is to accelerate convergence to a noise-dominated region, and then achieve convergence with an optimal worst-case complexity depending on the noise variance of the gradients. A side contribution of our work is also a generic analysis that can handle inexact proximal operators, providing new insights about the robustness of stochastic algorithms when the proximal operator cannot be exactly computed. An illustration from this work is explained in Figure <ref xlink:href="#uid105" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>.</p>
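        <p>The essence of the Catalyst mechanism can be sketched in a few lines: each outer iteration approximately minimizes the objective plus a quadratic proximal term centered at an extrapolated point, using any inner solver. The NumPy sketch below uses a fixed extrapolation parameter and a deliberately weak inner solver for illustration; the parameter choices analyzed in the paper depend on the problem's conditioning.</p>
        <pre>
# Hypothetical sketch of the Catalyst outer loop: proximal-point subproblems
# plus extrapolation accelerate a generic inner solver.
import numpy as np

def catalyst(grad_f, inner_solver, x0, kappa=1.0, iters=20, beta=0.9):
    x, y = x0.copy(), x0.copy()
    for _ in range(iters):
        # inner problem: min_z f(z) + (kappa/2) ||z - y||^2, solved approximately
        prox_grad = lambda z: grad_f(z) + kappa * (z - y)
        x_new = inner_solver(prox_grad, x)
        y = x_new + beta * (x_new - x)       # extrapolation (momentum) step
        x = x_new
    return x

def gd_inner(grad, z, lr=0.05, steps=50):    # a deliberately weak inner solver
    for _ in range(steps):
        z = z - lr * grad(z)
    return z

A = np.diag(np.linspace(0.01, 1.0, 20))      # ill-conditioned quadratic objective
grad_f = lambda x: A @ x
print(np.linalg.norm(catalyst(grad_f, gd_inner, np.ones(20))))
        </pre>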
        <object id="uid105">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/akulunchakov1.png" type="figure" width="362.9526pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Accelerating SVRG-like (top) and SAGA (bottom) methods for <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>ℓ</mi><mn>2</mn></msub></math></formula>-logistic regression with <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mi>μ</mi><mo>=</mo><mn>1</mn><mo>/</mo><mo>(</mo><mn>100</mn><mi>n</mi><mo>)</mo></mrow></math></formula> and mild dropout, which imitates stochasticity in the gradients. All plots are on a logarithmic scale for the objective function value, and the <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>x</mi></math></formula>-axis denotes the number of epochs. The colored tubes around each curve denote one standard deviation across 5 runs. The curves show that acceleration may be useful even in the stochastic optimization regime.</caption>
        </object>
      </subsection>
      <subsection id="uid106" level="2">
        <bodyTitle>Estimate Sequences for Variance-Reduced Stochastic Composite Optimization</bodyTitle>
        <participants>
          <person key="thoth-2018-idp157424">
            <firstname>Andrei</firstname>
            <lastname>Kulunchakov</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>In <ref xlink:href="#thoth-2019-bid12" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we propose a unified view of gradient-based algorithms for stochastic convex composite optimization. By extending the concept of estimate sequence introduced by Nesterov, we interpret a large class of stochastic optimization methods as procedures that iteratively minimize a surrogate of the objective. This point of view covers stochastic gradient descent (SGD), the variance-reduction approaches SAGA, SVRG, and MISO, as well as their proximal variants, and it has several advantages: (i) we provide a simple generic proof of convergence for all of the aforementioned methods; (ii) we naturally obtain new algorithms with the same guarantees; (iii) we derive generic strategies to make these algorithms robust to stochastic noise, which is useful when data is corrupted by small random perturbations. Finally, we show that this viewpoint is useful to obtain accelerated algorithms. A comparison with different approaches is shown in Figure <ref xlink:href="#uid107" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>.</p>
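        <p>To make the surrogate view concrete, assume the current surrogate is a quadratic centered at the iterate; mixing it with a new linear-plus-quadratic model of the objective and minimizing the result recovers an SGD-style update, as the small sketch below shows (a simplified special case of the framework, with our own notation).</p>
        <pre>
# Hypothetical sketch of one estimate-sequence step: with a surrogate
# d_k(x) = c_k + (mu/2) ||x - x_k||^2 centered at the current iterate, mixing
# in the model f(x_k) + g_k.(x - x_k) + (mu/2) ||x - x_k||^2 with weight delta
# and minimizing yields x_k - (delta/mu) * g_k, an SGD-style update.
import numpy as np

def surrogate_step(x_k, g_k, mu, delta):
    # both quadratics share the center x_k, so the minimizer shifts along -g_k
    return x_k - (delta / mu) * g_k

x = np.ones(5)
g = 2.0 * x                                   # gradient of ||x||^2 at x
print(surrogate_step(x, g, mu=2.0, delta=0.5))
        </pre>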
        <object id="uid107">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/akulunchakov2.png" type="figure" width="362.9526pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Comparison of different standard approaches with our method on two datasets for <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>ℓ</mi><mn>2</mn></msub></math></formula>-logistic regression, in the deterministic case (top) and with mild dropout (bottom). The case of exact gradient computations clearly shows the benefits of acceleration, namely fast linear convergence. In the stochastic case, the developed method is either superior or highly competitive, and converges to the optimum without bias. In both cases, acceleration generically combines the strengths of the standard methods and can even outperform them.</caption>
        </object>
      </subsection>
      <subsection id="uid108" level="2">
        <bodyTitle>White-box vs Black-box: Bayes Optimal Strategies for Membership Inference</bodyTitle>
        <participants>
          <person key="thoth-2018-idp169600">
            <firstname>Alexandre</firstname>
            <lastname>Sablayrolles</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Matthijs</firstname>
            <lastname>Douze</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Yann</firstname>
            <lastname>Ollivier</lastname>
          </person>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Hervé</firstname>
            <lastname>Jégou</lastname>
          </person>
        </participants>
        <p>Membership inference determines, given a sample and trained parameters of a machine learning model, whether the sample was part of the training set.
In this paper <ref xlink:href="#thoth-2019-bid13" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we derive the optimal strategy for membership inference with a few assumptions on the distribution of the parameters.
We show that optimal attacks only depend on the loss function, and thus black-box attacks are as good as white-box attacks.
As the optimal strategy is not tractable, we provide approximations of it leading to several inference methods <ref xlink:href="#uid109" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, and show that existing membership inference methods are coarser approximations of this optimal strategy.
Our membership attacks outperform the state of the art in various settings, ranging from a simple logistic regression to more complex architectures and datasets, such as ResNet-101 and ImageNet.</p>
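        <p>The simplest approximation of the optimal strategy is a black-box attack that thresholds the loss, exploiting the fact that training samples tend to have lower loss than held-out ones. The toy NumPy simulation below, with made-up loss distributions, illustrates why such an attack beats random guessing.</p>
        <pre>
# Hypothetical sketch of a loss-threshold membership attack on toy data.
import numpy as np

def membership_attack(losses, tau):
    """Predict member (1) when the loss falls below the threshold tau."""
    return (tau > losses).astype(int)

rng = np.random.default_rng(0)
train_losses = rng.exponential(0.2, size=500)   # members: typically low loss
test_losses = rng.exponential(1.0, size=500)    # non-members: typically higher
losses = np.concatenate([train_losses, test_losses])
labels = np.concatenate([np.ones(500), np.zeros(500)])
pred = membership_attack(losses, tau=0.5)
print((pred == labels).mean())                  # attack accuracy well above 0.5
        </pre>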
        <object id="uid109">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/asablayr1.png" type="figure" width="106.75pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Plate notation of the membership inference problem: for each data point <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>z</mi><mi>i</mi></msub></math></formula>, a binary membership variable <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>m</mi><mi>i</mi></msub></math></formula> is sampled, and <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>z</mi><mi>i</mi></msub></math></formula> belongs to the training set iff <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><msub><mi>m</mi><mi>i</mi></msub><mo>=</mo><mn>1</mn></mrow></math></formula>.
Given the trained parameters <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>θ</mi></math></formula> and a sample <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>z</mi><mi>i</mi></msub></math></formula>, we want to infer the value of <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><msub><mi>m</mi><mi>i</mi></msub></math></formula>.</caption>
        </object>
      </subsection>
    </subsection>
    <subsection id="uid110" level="1">
      <bodyTitle>Theory and Methods for Deep Neural Networks</bodyTitle>
      <subsection id="uid111" level="2">
        <bodyTitle>Group Invariance, Stability to Deformations, and Complexity of Deep Convolutional Representations</bodyTitle>
        <participants>
          <person key="thoth-2018-idp133056">
            <firstname>Alberto</firstname>
            <lastname>Bietti</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>The success of deep convolutional architectures is often attributed in part to their ability to learn multiscale and invariant representations of natural signals. However, a precise study of these properties and how they affect learning guarantees is still missing. In the paper <ref xlink:href="#thoth-2019-bid14" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we consider deep convolutional representations of signals; we study their invariance to translations and to more general groups of transformations, their stability to the action of diffeomorphisms, and their ability to preserve signal information. This analysis is carried out by introducing a multilayer kernel based on convolutional kernel networks and by studying the geometry induced by the kernel mapping. We then characterize the corresponding reproducing kernel Hilbert space (RKHS), showing that it contains a large class of convolutional neural networks with homogeneous activation functions. This analysis allows us to separate data representation from learning, and to provide a canonical measure of model complexity, the RKHS norm, which controls both stability and generalization of any learned model. In addition to models in the constructed RKHS, our stability analysis also applies to convolutional networks with generic activations such as rectified linear units, and we discuss its relationship with recent generalization bounds based on spectral norms.</p>
      </subsection>
      <subsection id="uid112" level="2">
        <bodyTitle>A Kernel Perspective for Regularizing Deep Neural Networks</bodyTitle>
        <participants>
          <person key="thoth-2018-idp133056">
            <firstname>Alberto</firstname>
            <lastname>Bietti</lastname>
          </person>
          <person key="thoth-2018-idp164736">
            <firstname>Grégoire</firstname>
            <lastname>Mialon</lastname>
          </person>
          <person key="thoth-2018-idp137968">
            <firstname>Dexiong</firstname>
            <lastname>Chen</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>We propose a new point of view for regularizing deep neural networks by using the norm of a reproducing kernel Hilbert space (RKHS) <ref xlink:href="#thoth-2019-bid15" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. Even though this norm cannot be computed, it admits upper and lower approximations leading to various practical strategies. Specifically, this perspective (i) provides a common umbrella for many existing regularization principles, including spectral norm penalties, gradient penalties, and adversarial training, (ii) leads to new effective regularization penalties, and (iii) suggests hybrid strategies combining lower and upper bounds to get better approximations of the RKHS norm. We experimentally show that this approach is effective when learning on small datasets and for obtaining adversarially robust models.</p>
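        <p>For illustration, the following minimal PyTorch sketch implements one of the upper-bound strategies mentioned above, penalizing the squared spectral norms of the weight matrices, with the largest singular values estimated by power iteration. The function names, the filtering of parameters and the penalty weight are illustrative choices, not the exact protocol of the paper.</p>
        <pre>
import torch

def spectral_norm_estimate(weight, n_iter=10):
    # power iteration estimating the largest singular value of a
    # (reshaped) weight matrix
    w = weight.reshape(weight.shape[0], -1)
    v = torch.randn(w.shape[1], device=w.device)
    u = torch.nn.functional.normalize(w @ v, dim=0)
    for _ in range(n_iter):
        v = torch.nn.functional.normalize(w.t() @ u, dim=0)
        u = torch.nn.functional.normalize(w @ v, dim=0)
    return torch.dot(u, w @ v)

def regularized_loss(model, data_loss, lam=0.01):
    # data-fitting loss plus the sum of squared spectral norms, one
    # possible upper-bound surrogate for the RKHS norm of the network
    penalty = sum(spectral_norm_estimate(p) ** 2
                  for name, p in model.named_parameters()
                  if name.endswith('weight') and p.ndim != 1)
    return data_loss + lam * penalty
        </pre>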
      </subsection>
      <subsection id="uid113" level="2">
        <bodyTitle>On the Inductive Bias of Neural Tangent Kernels</bodyTitle>
        <participants>
          <person key="thoth-2018-idp133056">
            <firstname>Alberto</firstname>
            <lastname>Bietti</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>State-of-the-art neural networks are heavily over-parameterized, making the optimization algorithm a crucial ingredient for learning predictive models with good generalization properties. A recent line of work has shown that in a certain over-parameterized regime, the learning dynamics of gradient descent are governed by a kernel obtained at initialization, called the neural tangent kernel. In <ref xlink:href="#thoth-2019-bid15" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we study the inductive bias of learning in such a regime by analyzing this kernel and the corresponding function space (RKHS). In particular, we study smoothness, approximation, and stability properties of functions with finite norm, including stability to image deformations in the case of convolutional networks, and compare them to other known kernels for similar architectures.</p>
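        <p>To make the object of study concrete, the following sketch computes the empirical neural tangent kernel of a scalar-output PyTorch model, as the inner product of the parameter gradients of the output at two inputs. The model and inputs are hypothetical placeholders; the paper analyzes the infinite-width limit of this quantity rather than computing it numerically.</p>
        <pre>
import torch

def empirical_ntk(model, x1, x2):
    # K(x1, x2) is the inner product of the gradients of the (scalar)
    # network output with respect to all parameters at the two inputs
    def param_grad(x):
        model.zero_grad()
        model(x).squeeze().backward()
        return torch.cat([p.grad.reshape(-1) for p in model.parameters()])
    return torch.dot(param_grad(x1), param_grad(x2))
        </pre>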
      </subsection>
      <subsection id="uid114" level="2">
        <bodyTitle>Large Memory Layers with Product Keys</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Guillaume</firstname>
            <lastname>Lample</lastname>
          </person>
          <person key="thoth-2018-idp169600">
            <firstname>Alexandre</firstname>
            <lastname>Sablayrolles</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Marc'Aurelio</firstname>
            <lastname>Ranzato</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Ludovic</firstname>
            <lastname>Denoyer</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Hervé</firstname>
            <lastname>Jégou</lastname>
          </person>
        </participants>
        <p>This paper introduces a structured memory which can be easily integrated into a neural network. The memory is very large by design and significantly increases the capacity of the architecture, by up to a billion parameters, with a negligible computational overhead. Its design and access pattern are based on product keys, which enable fast and exact nearest-neighbor search. The ability to increase the number of parameters while keeping the same computational budget lets the overall system strike a better trade-off between prediction accuracy and computational efficiency, both at training and test time. This memory layer, shown in Figure <ref xlink:href="#uid115" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, allows us to tackle very large-scale language modeling tasks. In our experiments we consider a dataset with up to 30 billion words, and we plug our memory layer into a state-of-the-art transformer-based architecture. In particular, we find that a memory-augmented model with only 12 layers outperforms a baseline transformer model with 24 layers, while being twice as fast at inference time. We release our code for reproducibility purposes.</p>
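        <p>The following sketch illustrates the exact top-k search with product keys underlying the memory layer: the query is split in two halves, each half is scored against a small sub-key codebook, and the best candidates of the induced Cartesian product are re-ranked. This is a simplified single-query version with hypothetical tensor names and shapes.</p>
        <pre>
import torch

def product_key_topk(q, subkeys1, subkeys2, k):
    # q: (d,); subkeys1, subkeys2: (n, d/2). Each half-query is scored
    # against its codebook of n sub-keys instead of all n * n product keys.
    q1, q2 = q.chunk(2)
    s1, i1 = torch.topk(subkeys1 @ q1, k)
    s2, i2 = torch.topk(subkeys2 @ q2, k)
    # scores of the k * k candidate product keys are sums of half-scores,
    # and the global top-k is guaranteed to lie among these candidates
    cand = (s1[:, None] + s2[None, :]).reshape(-1)
    scores, flat = torch.topk(cand, k)
    rows, cols = flat // k, flat % k
    # index of each selected key in the implicit Cartesian product
    idx = i1[rows] * subkeys2.shape[0] + i2[cols]
    return scores, idx
        </pre>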
        <object id="uid115">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/asablayr3.png" type="figure" width="362.9526pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Overview of a key-value memory layer: The input <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>x</mi></math></formula> is processed through a query network that produces a query vector <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>q</mi></math></formula>, which is compared to all the keys.
The output is the sparse weighted sum over the memories associated with the selected keys.
For a large number of keys <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mo>|</mo><mi>𝒦</mi><mo>|</mo></mrow></math></formula>, the key selection procedure becomes too expensive in practice.
Our product key method is exact and makes this search process very fast.</caption>
        </object>
      </subsection>
      <subsection id="uid116" level="2">
        <bodyTitle>Understanding Priors in Bayesian Neural Networks at the Unit Level</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Mariia</firstname>
            <lastname>Vladimirova</lastname>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Pablo</firstname>
            <lastname>Mesejo</lastname>
            <moreinfo>Univ. Granada, Spain</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Julyan</firstname>
            <lastname>Arbel</lastname>
            <moreinfo>Inria MISTIS</moreinfo>
          </person>
        </participants>
        <p>In our ICML'19 paper <ref xlink:href="#thoth-2019-bid16" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we investigate deep Bayesian neural networks with Gaussian weight priors and a class of ReLU-like nonlinearities. Bayesian neural networks with Gaussian priors are well known to induce an L2 (“weight decay”) regularization. Our results characterize a more intricate regularization effect at the level of the unit activations. Our main result establishes that the induced prior distribution on the units before and after activation becomes increasingly heavy-tailed with the depth of the layer. We show that first-layer units are Gaussian, second-layer units are sub-exponential, and units in deeper layers are characterized by sub-Weibull distributions. Our results provide new theoretical insight into deep Bayesian neural networks, which we corroborate with experimental simulation results.</p>
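        <p>The heavier-tails phenomenon can be observed numerically with a few lines of code. The sketch below draws hidden-unit values from the prior of a ReLU network with i.i.d. Gaussian weights and compares a simple tail statistic across depths; the widths, sample sizes and statistic are illustrative choices, not the simulation protocol of the paper.</p>
        <pre>
import numpy as np

rng = np.random.default_rng(0)

def unit_prior_samples(depth, width=100, n_draws=20000):
    # fix one input, then repeatedly sample Gaussian weights and
    # propagate through ReLU layers to sample a hidden unit's prior
    x0 = rng.standard_normal(width)
    out = np.empty(n_draws)
    for i in range(n_draws):
        h = x0
        for _ in range(depth):
            w = rng.standard_normal((width, width)) / np.sqrt(width)
            h = np.maximum(w @ h, 0.0)
        out[i] = h[0]
    return out

for depth in (1, 2, 3):
    s = unit_prior_samples(depth)
    # heavier tails give a larger ratio between extreme quantiles
    q90, q999 = np.quantile(s, [0.9, 0.999])
    print(depth, q999 / q90)
        </pre>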
      </subsection>
      <subsection id="uid117" level="2">
        <bodyTitle>Adaptative Inference Cost With Convolutional Neural Mixture Models</bodyTitle>
        <participants>
          <person key="PASUSERID">
            <firstname>Adria</firstname>
            <lastname>Ruiz</lastname>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
        </participants>
        <p>Despite the outstanding performance of convolutional neural networks (CNNs) for many vision tasks, the required computational cost during inference is problematic when resources are limited. In this paper <ref xlink:href="#thoth-2019-bid17" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we propose Convolutional Neural Mixture Models (CNMMs), a probabilistic model embedding a large number of CNNs that can be jointly trained and evaluated in an efficient manner. Within the proposed framework, we present different mechanisms to prune subsets of CNNs from the mixture, making it easy to adapt the computational cost required for inference (see Fig. <ref xlink:href="#uid118" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>). Image classification and semantic segmentation experiments show that our method achieves excellent accuracy-compute trade-offs. Moreover, unlike most previous approaches, a single CNMM provides a large range of operating points along this trade-off, without any re-training.</p>
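        <p>The sketch below illustrates only the pruning mechanism: given the outputs of the mixture components and their learned mixing weights, the networks with small weights can be dropped and the remaining weights renormalized. It is a deliberately simplified view that ignores the weight sharing which makes joint training and evaluation of all networks efficient in the actual model.</p>
        <pre>
import torch

def mixture_predict(outputs, log_pi, keep=None):
    # outputs: (M, n_classes) predictions of the M embedded networks;
    # log_pi: learned mixing weights over the networks
    pi = torch.softmax(log_pi, dim=0)
    if keep is not None:
        # keep only the networks with the largest mixing weights,
        # reducing inference cost, then renormalize
        pi, idx = torch.topk(pi, keep)
        pi = pi / pi.sum()
        outputs = outputs[idx]
    return (pi[:, None] * outputs).sum(dim=0)
        </pre>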
        <object id="uid118">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/aruiz3.png" type="figure" width="277.5474pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>A Convolutional Neural Mixture Model embeds a large number of CNNs.
Weight sharing enables efficient joint training of all networks and computation of the mixture output. The learned mixing weights can be used to remove networks from the mixture, and thus reduce the computational cost of inference.</caption>
        </object>
      </subsection>
    </subsection>
    <subsection id="uid119" level="1">
      <bodyTitle>Pluri-disciplinary Research</bodyTitle>
      <subsection id="uid120" level="2">
        <bodyTitle>Biological Sequence Modeling with Convolutional Kernel Networks</bodyTitle>
        <participants>
          <person key="thoth-2018-idp137968">
            <firstname>Dexiong</firstname>
            <lastname>Chen</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Laurent</firstname>
            <lastname>Jacob</lastname>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>The growing number of annotated biological sequences available makes it possible to learn genotype-phenotype relationships from data with increasingly high accuracy. When large quantities of labeled samples are available for training a model, convolutional neural networks can be used to predict the phenotype of unannotated sequences with good accuracy. Unfortunately, their performance on medium- or small-scale datasets is mixed, which calls for new data-efficient approaches. In <ref xlink:href="#thoth-2019-bid18" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, <ref xlink:href="#thoth-2019-bid19" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we introduce a hybrid approach between convolutional neural networks and kernel methods to model biological sequences. Our method, shown in Figure <ref xlink:href="#uid121" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, enjoys the ability of convolutional neural networks to learn data representations that are adapted to a specific task, while the kernel point of view yields algorithms that perform significantly better when the amount of training data is small. We illustrate these advantages for transcription factor binding prediction and protein homology detection, and we demonstrate that our model is also simple to interpret, which is crucial for discovering predictive motifs in sequences. The source code is freely available at <ref xlink:href="https://gitlab.inria.fr/dchen/CKN-seq" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>gitlab.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>dchen/<allowbreak/>CKN-seq</ref>.</p>
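        <p>For illustration, a single layer of the model can be sketched as follows: each k-mer is compared to a set of learned anchor points through a Gaussian kernel on the unit sphere, and the resulting feature vectors are averaged over positions, mirroring the global pooling step of Figure <ref xlink:href="#uid121" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>. The projection step of the Nyström approximation is omitted, and all names and sizes are illustrative.</p>
        <pre>
import numpy as np

def ckn_seq_layer(onehot, anchors, sigma=0.5):
    # onehot: (length, alphabet) one-hot encoded sequence;
    # anchors: (p, k * alphabet) unit-norm learned k-mer prototypes
    k = anchors.shape[1] // onehot.shape[1]
    m = onehot.shape[0] - k + 1
    feats = np.zeros(anchors.shape[0])
    for i in range(m):
        kmer = onehot[i:i + k].reshape(-1)
        kmer = kmer / max(np.linalg.norm(kmer), 1e-8)
        # Gaussian kernel on the sphere: exp((anchor . kmer - 1) / sigma^2)
        feats += np.exp((anchors @ kmer - 1.0) / sigma ** 2)
    return feats / m  # global average pooling over k-mer positions
        </pre>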
        <object id="uid121">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/dchen1.png" type="figure" width="341.6013pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Construction of single-layer (left) and multilayer (middle) CKN-seq and the approximation of one layer (right). For a single-layer model, each <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>k</mi></math></formula>-mer <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><msub><mi>P</mi><mi>i</mi></msub><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow></mrow></math></formula> is mapped to <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><msub><mi>ϕ</mi><mn>0</mn></msub><mrow><mo>(</mo><msub><mi>P</mi><mi>i</mi></msub><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow><mo>)</mo></mrow></mrow></math></formula> in <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mi>ℱ</mi></math></formula> and projected to <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mi>Π</mi><msub><mi>ϕ</mi><mn>0</mn></msub><mrow><mo>(</mo><msub><mi>P</mi><mi>i</mi></msub><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow><mo>)</mo></mrow></mrow></math></formula> parametrized by <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><msub><mi>ψ</mi><mn>0</mn></msub><mrow><mo>(</mo><msub><mi>P</mi><mi>i</mi></msub><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow><mo>)</mo></mrow></mrow></math></formula>. Then, the final finite-dimensional sequence is obtained by the global pooling, <formula type="inline"><math xmlns="http://www.w3.org/1998/Math/MathML" overflow="scroll"><mrow><mi>ψ</mi><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow><mo>=</mo><mfrac><mn>1</mn><mi>m</mi></mfrac><msubsup><mo>∑</mo><mrow><mi>i</mi><mo>=</mo><mn>0</mn></mrow><mi>m</mi></msubsup><msub><mi>ψ</mi><mn>0</mn></msub><mrow><mo>(</mo><msub><mi>P</mi><mi>i</mi></msub><mrow><mo>(</mo><mi>𝐱</mi><mo>)</mo></mrow><mo>)</mo></mrow></mrow></math></formula>. The multilayer construction is similar, but relies on intermediate maps, obtained by local pooling.</caption>
        </object>
      </subsection>
      <subsection id="uid122" level="2">
        <bodyTitle>Recurrent Kernel Networks</bodyTitle>
        <participants>
          <person key="thoth-2018-idp137968">
            <firstname>Dexiong</firstname>
            <lastname>Chen</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Laurent</firstname>
            <lastname>Jacob</lastname>
            <moreinfo>CNRS, LBBE Laboratory</moreinfo>
          </person>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
        </participants>
        <p>Substring kernels are classical tools for representing biological sequences or text. However, when large amounts of annotated data are available, models that allow end-to-end training, such as neural networks, are often preferred. Links between recurrent neural networks (RNNs) and substring kernels have recently been drawn, by formally showing that RNNs with specific activation functions are points in a reproducing kernel Hilbert space (RKHS). In this paper <ref xlink:href="#thoth-2019-bid20" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we revisit this link by generalizing convolutional kernel networks, originally related to a relaxation of the mismatch kernel, to model gaps in sequences. This results in a new type of recurrent neural network (Figure <ref xlink:href="#uid123" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>), which can be trained end-to-end with backpropagation, or without supervision by using kernel approximation techniques. We experimentally show that our approach is well suited to biological sequences, where it outperforms existing methods on protein classification tasks.</p>
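        <p>The recurrence underlying the model can be sketched as follows: at each position of the sequence, a dynamic-programming update scores gapped alignments of learned prototype k-mers against the prefix read so far, with a factor lam penalizing gaps. This is a simplified illustration; the exact recursion, normalization and parameterization used in the paper differ.</p>
        <pre>
import numpy as np

def rkn_forward(seq, anchors, lam=0.5, sigma=0.5):
    # seq: (T, d) unit-norm symbol embeddings; anchors: (p, k, d)
    # unit-norm prototype k-mers; lam penalizes gaps in alignments
    p, k, _ = anchors.shape
    c = np.zeros((p, k + 1))
    c[:, 0] = 1.0
    for t in range(seq.shape[0]):
        kappa = np.exp((anchors @ seq[t] - 1.0) / sigma ** 2)  # (p, k)
        # recurrent update of the gapped substring kernel; iterating
        # j downwards makes c[:, j - 1] refer to the previous time step
        for j in range(k, 0, -1):
            c[:, j] = lam * c[:, j] + c[:, j - 1] * kappa[:, j - 1]
    return c[:, k]
        </pre>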
        <object id="uid123">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/dchen2.png" type="figure" width="320.25pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Representation of a sequence in a RKHS based on our kernel.</caption>
        </object>
      </subsection>
      <subsection id="uid124" level="2">
        <bodyTitle>Depth-adaptive Transformer</bodyTitle>
        <participants>
          <person key="thoth-2018-idp145264">
            <firstname>Maha</firstname>
            <lastname>Elbayad</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Jiatao</firstname>
            <lastname>Gu</lastname>
            <moreinfo>Facebook AI</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Edouard</firstname>
            <lastname>Grave</lastname>
            <moreinfo>Facebook AI</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Michael</firstname>
            <lastname>Auli</lastname>
            <moreinfo>Facebook AI</moreinfo>
          </person>
        </participants>
        <p>State-of-the-art sequence-to-sequence models for large-scale tasks perform a fixed number of computations for each input sequence, regardless of whether it is easy or hard to process. In our ICLR'2020 paper <ref xlink:href="#thoth-2019-bid21" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>, we train Transformer models which can make output predictions at different stages of the network, and we investigate different ways to predict how much computation is required for a particular sequence. Unlike dynamic computation in Universal Transformers, which applies the same set of layers iteratively, we apply different layers at every step, to adjust both the amount of computation and the model capacity. On IWSLT German-English translation, our approach matches the accuracy of a well-tuned baseline Transformer while using less than a quarter of the decoder layers. Figure <ref xlink:href="#uid125" location="intern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/> illustrates the different halting mechanisms investigated in this work: a sequence-level approach, where all tokens of a sequence are assumed to be equally difficult, and a token-level approach, where tokens can exit at varying depths.</p>
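        <p>The token-level halting mechanism can be sketched as follows: each decoder layer has an attached output classifier, and a token stops being refined at the first layer where the classifier is sufficiently confident. This is a schematic simplification with hypothetical modules and a fixed confidence threshold; the paper also studies learned halting predictors, and a real implementation would skip computation for halted tokens rather than freezing their states.</p>
        <pre>
import torch

def token_level_exit(layers, classifiers, h, threshold=0.9):
    # h: (n_tokens, d) decoder states; halted tokens keep their state
    # while the remaining ones are refined by further layers
    n = h.shape[0]
    done = torch.zeros(n, dtype=torch.bool)
    preds = torch.zeros(n, dtype=torch.long)
    for layer, clf in zip(layers, classifiers):
        h = torch.where(done[:, None], h, layer(h))
        probs = torch.softmax(clf(h), dim=-1)
        conf, words = probs.max(dim=-1)
        newly = torch.logical_and(conf.ge(threshold),
                                  torch.logical_not(done))
        preds[newly] = words[newly]
        done = torch.logical_or(done, newly)
    return preds, done
        </pre>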
        <object id="uid125">
          <table>
            <tr>
              <td>
                <ressource xlink:href="IMG/elbayad.png" type="figure" width="405.6487pt" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest" media="WEB"/>
              </td>
            </tr>
          </table>
          <caption>Illustration of the variant adaptive depth predictors: (a) the sequence-level and (b, c) at the token-level.</caption>
        </object>
      </subsection>
    </subsection>
  </resultats>
  <contrats id="uid126">
    <bodyTitle>Bilateral Contracts and Grants with Industry</bodyTitle>
    <subsection id="uid127" level="1">
      <bodyTitle>Intel</bodyTitle>
      <participants>
        <person key="thoth-2018-idp119968">
          <firstname>Cordelia</firstname>
          <lastname>Schmid</lastname>
        </person>
        <person key="thoth-2018-idp115024">
          <firstname>Karteek</firstname>
          <lastname>Alahari</lastname>
        </person>
      </participants>
      <p>The Intel Network on Intelligent Systems in Europe brings together leading researchers in robotics, computer vision, motor control, and machine learning. We are part of this network and participated in the annual retreat in 2018. Funding is provided on an annual basis for as long as we are part of the network.
</p>
    </subsection>
    <subsection id="uid128" level="1">
      <bodyTitle>Facebook</bodyTitle>
      <participants>
        <person key="thoth-2018-idp119968">
          <firstname>Cordelia</firstname>
          <lastname>Schmid</lastname>
        </person>
        <person key="thoth-2018-idp122832">
          <firstname>Jakob</firstname>
          <lastname>Verbeek</lastname>
        </person>
        <person key="thoth-2018-idp112112">
          <firstname>Julien</firstname>
          <lastname>Mairal</lastname>
        </person>
        <person key="thoth-2018-idp115024">
          <firstname>Karteek</firstname>
          <lastname>Alahari</lastname>
        </person>
        <person key="thoth-2018-idp159856">
          <firstname>Pauline</firstname>
          <lastname>Luc</lastname>
        </person>
        <person key="thoth-2018-idp169600">
          <firstname>Alexandre</firstname>
          <lastname>Sablayrolles</lastname>
        </person>
        <person key="thoth-2018-idp135488">
          <firstname>Mathilde</firstname>
          <lastname>Caron</lastname>
        </person>
        <person key="thoth-2019-idp174096">
          <firstname>Lina</firstname>
          <lastname>Mezghani</lastname>
        </person>
      </participants>
      <p>The collaboration started in 2016. The topics include image retrieval with CNN-based descriptors, weakly supervised object detection and semantic segmentation, and learning structured models for action recognition in videos. In 2016, Pauline Luc started her PhD, funded by a CIFRE grant and jointly supervised by Jakob Verbeek (Inria) and Camille Couprie (Facebook AI Research). THOTH was selected in 2016 as a recipient of the Facebook GPU Partnership program; in this context, Facebook donated two state-of-the-art servers with 8 GPUs. In 2017, Alexandre Sablayrolles started his CIFRE grant, jointly supervised by Cordelia Schmid, and Hervé Jégou and Matthijs Douze at Facebook AI Research. In 2018, Mathilde Caron started as a CIFRE PhD student, jointly supervised by Julien Mairal, and Armand Joulin and Piotr Bojanowski at Facebook AI Research. Lina Mezghani joined the collaboration as a new PhD student in 2019.
</p>
    </subsection>
    <subsection id="uid129" level="1">
      <bodyTitle>NAVER LABS Europe</bodyTitle>
      <participants>
        <person key="thoth-2018-idp115024">
          <firstname>Karteek</firstname>
          <lastname>Alahari</lastname>
        </person>
      </participants>
      <p>This collaboration started when NAVER LABS Europe was Xerox Research Centre Europe, and has been ongoing since October 2009, with two co-supervised CIFRE scholarships (2009–2012, 2011–2014). In June 2014, we signed a third collaborative agreement for a duration of three years. The goal is to develop approaches for deep-learning-based image description and pose estimation in videos. Jakob Verbeek and Diane Larlus (XRCE) jointly supervised a PhD-level intern for a period of 6 months in 2016-2017. XRCE became NAVER LABS Europe in 2017. A one-year research contract on action recognition in videos started in September 2017. The approach developed by Vasileios Choutas implements pose-based motion features, which are shown to be complementary to state-of-the-art I3D features. Nieves Crasto's internship in 2018 was jointly supervised by Philippe Weinzaepfel (NAVER LABS), Karteek Alahari and Cordelia Schmid. A new CIFRE PhD contract was submitted to ANRT for approval in October 2019.
</p>
    </subsection>
    <subsection id="uid130" level="1">
      <bodyTitle>Valeo AI</bodyTitle>
      <participants>
        <person key="thoth-2018-idp115024">
          <firstname>Karteek</firstname>
          <lastname>Alahari</lastname>
        </person>
        <person key="thoth-2019-idp134960">
          <firstname>Florent</firstname>
          <lastname>Bartoccioni</lastname>
        </person>
      </participants>
      <p>This collaboration started in 2019 with the arrival of PhD student Florent Bartoccioni. Despite the progress seen in computer vision, there remains a large disparity between human and machine-based scene understanding. For example, at any road intersection most people can accurately forecast or anticipate events, such as changes in the colour of traffic lights, or when and how pedestrians are likely to cross the street. This apparently natural human behaviour is not replicable by state-of-the-art computer vision methods, which are ill-equipped to make such forecasts. The goal of this collaborative PhD is to address this forecasting problem.
</p>
    </subsection>
    <subsection id="uid131" level="1">
      <bodyTitle>Criteo</bodyTitle>
      <participants>
        <person key="thoth-2018-idp112112">
          <firstname>Julien</firstname>
          <lastname>Mairal</lastname>
        </person>
      </participants>
      <p>This collaboration started in April 2019, with the arrival of a master's student, Houssam Zenati, who will pursue a CIFRE PhD starting in 2020. The goal of this collaboration is to develop machine learning techniques for counterfactual loss optimization, a fundamental problem in machine learning related to causal inference. More precisely, the aim is to learn stochastic policies from offline logged data. The problem is important for web advertising, the main activity of Criteo, but the potential scope of application is much larger, with possible applications in medicine and the experimental sciences.
</p>
    </subsection>
    <subsection id="uid132" level="1">
      <bodyTitle>Google</bodyTitle>
      <participants>
        <person key="thoth-2018-idp115024">
          <firstname>Karteek</firstname>
          <lastname>Alahari</lastname>
        </person>
        <person key="thoth-2018-idp130624">
          <firstname>Minttu</firstname>
          <lastname>Alakuijala</lastname>
        </person>
        <person key="thoth-2018-idp147696">
          <firstname>Valentin</firstname>
          <lastname>Gabeur</lastname>
        </person>
        <person key="thoth-2018-idp112112">
          <firstname>Julien</firstname>
          <lastname>Mairal</lastname>
        </person>
      </participants>
      <p>This collaboration started in February 2019, with the arrival of two CIFRE PhD students, Minttu Alakuijala and Valentin Gabeur, who are working on visual models for robotics and on 3D human pose estimation, respectively.
</p>
    </subsection>
  </contrats>
  <partenariat id="uid133">
    <bodyTitle>Partnerships and Cooperations</bodyTitle>
    <subsection id="uid134" level="1">
      <bodyTitle>Regional Initiatives</bodyTitle>
      <subsection id="uid135" level="2">
        <bodyTitle>MIAI chair - Towards more data efficiency in machine learning</bodyTitle>
        <participants>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
          <person key="thoth-2018-idp115024">
            <firstname>Karteek</firstname>
            <lastname>Alahari</lastname>
          </person>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
        </participants>
        <p>Julien Mairal holds a chair of the 3IA MIAI institute. The goal is to improve the data
efficiency of machine learning algorithms.</p>
      </subsection>
      <subsection id="uid136" level="2">
        <bodyTitle>MIAI chair - Towards self-supervised visual learning</bodyTitle>
        <participants>
          <person key="thoth-2018-idp119968">
            <firstname>Cordelia</firstname>
            <lastname>Schmid</lastname>
          </person>
        </participants>
        <p>Cordelia Schmid holds a chair of the 3IA MIAI institute. The goal is to develop
new self-supervised learning methods for computer vision.</p>
      </subsection>
      <subsection id="uid137" level="2">
        <bodyTitle>MIAI chair - Multiscale, multimodal and multitemporal remote sensing</bodyTitle>
        <participants>
          <person key="thoth-2019-idp127152">
            <firstname>Jocelyn</firstname>
            <lastname>Chanussot</lastname>
          </person>
        </participants>
        <p>Jocelyn Chanussot holds a chair of the 3IA MIAI institute.</p>
      </subsection>
      <subsection id="uid138" level="2">
        <bodyTitle>DeCore (Deep Convolutional and Recurrent networks for image, speech, and text)</bodyTitle>
        <participants>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
          <person key="thoth-2018-idp145264">
            <firstname>Maha</firstname>
            <lastname>Elbayad</lastname>
          </person>
        </participants>
        <p>DeCore is a project funded by the Persyval Lab for 3.5 years (September 2016 – February 2020), coordinated by Jakob Verbeek. It unites experts from Grenoble's applied-math and computer science labs LJK, GIPSA-LAB and LIG in the areas of computer vision, machine learning, speech, natural language processing, and information retrieval. The purpose of DeCore is to stimulate collaborative interdisciplinary research on deep learning in the Grenoble area, which is likely to underpin future advances in machine perception (vision, speech, text) over the next decade. It provides funding for two full PhD students. Maha Elbayad is one of them, supervised by Jakob Verbeek and Laurent Besacier (LIG, UGA).</p>
      </subsection>
      <subsection id="uid139" level="2">
        <bodyTitle>PEPS AMIES AuMalis POLLEN</bodyTitle>
        <participants>
          <person key="thoth-2018-idp115024">
            <firstname>Karteek</firstname>
            <lastname>Alahari</lastname>
          </person>
        </participants>
        <p>This is a collaborative project with POLLEN Metrology, a startup in the Grenoble area that develops software specialized in signal processing, hybrid metrology and machine learning for the automatic processing of heterogeneous data. The funding supports a postdoc to accelerate the introduction of artificial intelligence, and in particular computer vision techniques, into the manufacture of new generations of microprocessors. Karteek Alahari and Valérie Perrier (LJK, UGA) jointly supervised a postdoc as part of this collaboration, which ended in 2019.
</p>
      </subsection>
    </subsection>
    <subsection id="uid140" level="1">
      <bodyTitle>National Initiatives</bodyTitle>
      <subsection id="uid141" level="2">
        <bodyTitle>ANR Project Macaron</bodyTitle>
        <participants>
          <person key="thoth-2018-idp112112">
            <firstname>Julien</firstname>
            <lastname>Mairal</lastname>
          </person>
          <person key="PASUSERID">
            <firstname>Zaid</firstname>
            <lastname>Harchaoui</lastname>
            <moreinfo>Univ. Washington</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Laurent</firstname>
            <lastname>Jacob</lastname>
            <moreinfo>CNRS, LBBE Laboratory</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Michael</firstname>
            <lastname>Blum</lastname>
            <moreinfo>CNRS, TIMC Laboratory</moreinfo>
          </person>
          <person key="PASUSERID">
            <firstname>Joseph</firstname>
            <lastname>Salmon</lastname>
            <moreinfo>Telecom ParisTech</moreinfo>
          </person>
          <person key="thoth-2018-idp142832">
            <firstname>Mikita</firstname>
            <lastname>Dvornik</lastname>
          </person>
          <person key="thoth-2018-idp184208">
            <firstname>Daan</firstname>
            <lastname>Wynen</lastname>
          </person>
        </participants>
        <p>The project MACARON is an endeavor to develop new mathematical and algorithmic
tools for making machine learning more scalable. Our ultimate goal is to use data for
solving scientific problems and automatically converting data into scientific
knowledge by using machine learning techniques. Therefore, our project has two
different axes, a methodological one, and an applied one driven by explicit
problems. The methodological axis addresses the limitations of current machine
learning for simultaneously dealing with large-scale data and huge models. The
second axis addresses open scientific problems in bioinformatics, computer
vision, image processing, and neuroscience, where a massive amount of data is
currently produced, and where huge-dimensional models yield similar
computational problems.</p>
        <p>This is a four-and-a-half-year project, funded by ANR under the program “Jeunes chercheurs, jeunes chercheuses”, which started in October 2014 and ended in March 2019. The principal investigator is Julien Mairal.</p>
      </subsection>
      <subsection id="uid142" level="2">
        <bodyTitle>ANR Project DeepInFrance</bodyTitle>
        <participants>
          <person key="thoth-2018-idp122832">
            <firstname>Jakob</firstname>
            <lastname>Verbeek</lastname>
          </person>
          <person key="thoth-2018-idp128160">
            <firstname>Adria</firstname>
            <lastname>Ruiz Ovejero</lastname>
          </person>
        </participants>
        <p>The DeepInFrance project (machine learning with deep neural networks) aims at bringing together complementary machine learning, computer vision and machine listening research groups working on deep learning with GPUs, in order to provide the community with the knowledge, the visibility and the tools that will bring France among the key players in deep learning. The long-term vision of DeepInFrance is to open new frontiers and foster research towards algorithms capable of discovering sense in data in an automatic manner, a stepping stone towards the more ambitious far-end goal of machine reasoning. The project partners are: INSA Rouen, Univ. Caen, Inria, UPMC, Aix-Marseille Univ., and Univ. Nice Sophia Antipolis.</p>
      </subsection>
      <subsection id="uid143" level="2">
        <bodyTitle>ANR Project AVENUE</bodyTitle>
        <participants>
          <person key="thoth-2018-idp115024">
            <firstname>Karteek</firstname>
            <lastname>Alahari</lastname>
          </person>
        </participants>
        <p>This ANR project (started in October 2018) aims to address the perception gap between human and artificial visual systems through a visual memory network for human-like interpretation of scenes. To this end, we address three scientific challenges. The first is to learn a network representation of image, video and text data collections, to leverage their inherent diverse cues. The second is to depart from supervised learning paradigms, without compromising performance. The third is to perform inference with the learnt network, e.g., to estimate physical and functional properties of objects, or to give cautionary advice for navigating a scene. The principal investigator is Karteek Alahari, and the project involves participants from CentraleSupelec and Ecole des Ponts in Paris.</p>
      </subsection>
    </subsection>
    <subsection id="uid144" level="1">
      <bodyTitle>European Initiatives</bodyTitle>
      <subsection id="uid145" level="2">
        <bodyTitle>FP7 &amp; H2020 Projects</bodyTitle>
        <subsection id="uid146" level="3">
          <bodyTitle>ERC Advanced grant Allegro</bodyTitle>
          <participants>
            <person key="thoth-2018-idp119968">
              <firstname>Cordelia</firstname>
              <lastname>Schmid</lastname>
            </person>
            <person key="thoth-2018-idp172032">
              <firstname>Konstantin</firstname>
              <lastname>Shmelkov</lastname>
            </person>
            <person key="thoth-2018-idp174464">
              <firstname>Vladyslav</firstname>
              <lastname>Sydorov</lastname>
            </person>
            <person key="thoth-2018-idp184208">
              <firstname>Daan</firstname>
              <lastname>Wynen</lastname>
            </person>
            <person key="thoth-2018-idp142832">
              <firstname>Nikita</firstname>
              <lastname>Dvornik</lastname>
            </person>
            <person key="thoth-2018-idp191568">
              <firstname>Xavier</firstname>
              <lastname>Martin</lastname>
            </person>
          </participants>
          <p>The ERC advanced grant ALLEGRO started in April 2013 and ended in April 2019. The aim of ALLEGRO is to automatically learn from large quantities
of data with weak labels. A massive and ever growing amount of digital
image and video content is available today. It often comes with additional
information, such as text, audio or other meta-data, that forms a rather
sparse and noisy, yet rich and diverse source of annotation, ideally suited
to emerging weakly supervised and active machine learning technology. The
ALLEGRO project will take visual recognition to the next level by using this
largely untapped source of data to automatically learn visual models. We
will develop approaches capable of autonomously exploring evolving data
collections, selecting the relevant information, and determining the visual
models most appropriate for different object, scene, and activity
categories. An emphasis will be put on learning visual models from video, a
particularly rich source of information, and on the representation of human
activities, one of today's most challenging problems in computer vision.</p>
        </subsection>
        <subsection id="uid147" level="3">
          <bodyTitle>ERC Starting grant Solaris</bodyTitle>
          <participants>
            <person key="thoth-2018-idp112112">
              <firstname>Julien</firstname>
              <lastname>Mairal</lastname>
            </person>
            <person key="thoth-2018-idp186640">
              <firstname>Ghislain</firstname>
              <lastname>Durif</lastname>
            </person>
            <person key="thoth-2018-idp157424">
              <firstname>Andrei</firstname>
              <lastname>Kulunchakov</lastname>
            </person>
            <person key="thoth-2018-idp133056">
              <firstname>Alberto</firstname>
              <lastname>Bietti</lastname>
            </person>
            <person key="thoth-2018-idp137968">
              <firstname>Dexiong</firstname>
              <lastname>Chen</lastname>
            </person>
            <person key="thoth-2018-idp164736">
              <firstname>Gregoire</firstname>
              <lastname>Mialon</lastname>
            </person>
          </participants>
          <p>The project SOLARIS started in March 2017 for a duration of five years. The
goal of the project is to lay the methodological and theoretical foundations
of deep learning models, in the context of large-scale data processing. The
main applications of the tools developed in this project are for processing
visual data, such as videos, but also structured data produced in
experimental sciences, such as biological sequences.</p>
          <p>The main paradigm used in the project is that of kernel methods, and consists of building functional spaces where deep learning models live. By doing so, we want to derive theoretical properties of deep learning models that may explain their success, and also obtain new tools with better stability properties. Another work package of the project is focused on large-scale optimization, which is key to obtaining fast learning algorithms.</p>
        </subsection>
      </subsection>
    </subsection>
    <subsection id="uid148" level="1">
      <bodyTitle>International Initiatives</bodyTitle>
      <subsection id="uid149" level="2">
        <bodyTitle>Inria International Labs</bodyTitle>
        <p>
          <b>Inria@EastCoast</b>
        </p>
        <p noindent="true">Associate Team involved in the International Lab:</p>
        <subsection id="uid150" level="3">
          <bodyTitle>
            <ref xlink:href="https://team.inria.fr/gaya/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">GAYA </ref>
          </bodyTitle>
          <sanspuceslist>
            <li id="uid151">
              <p noindent="true">Title: Semantic and Geometric Models for Video Interpretation</p>
            </li>
            <li id="uid152">
              <p noindent="true">International Partner (Institution - Laboratory - Researcher):</p>
              <sanspuceslist>
                <li id="uid153">
                  <p noindent="true">Carnegie Mellon University (United States)
- Machine Learning Department - Katerina Fragkiadaki</p>
                </li>
              </sanspuceslist>
            </li>
            <li id="uid154">
              <p noindent="true">Start year: 2019</p>
            </li>
            <li id="uid155">
              <p noindent="true">See also: <ref xlink:href="https://team.inria.fr/gaya/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>team.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>gaya/</ref></p>
            </li>
            <li id="uid156">
              <p noindent="true">We propose to renew the associate team GAYA, with the primary goal of interpreting videos in terms of recognizing actions, understanding the human-human and human-object interactions. In the first three years, the team has started addressing the problem of learning an efficient and robust video representation to attack this challenge. GAYA will now focus on building semantic models, wherein we learn incremental, joint audio-visual models, with limited supervision, and also geometric models, where we study the geometric properties of object shapes to better recognize them. The team consists of researchers from two Inria project-teams (Thoth and WILLOW), a US university (Carnegie Mellon University [CMU]) as the main partner team, and another US university (UC Berkeley) as a secondary partner. It will allow the partners to effectively combine their respective strengths in areas such as inference and machine learning approaches for vision tasks, joint audio-visual models, large-scale learning, geometric reasoning. The main expected outcomes of this collaboration are: new machine learning algorithms for handling minimally annotated multi-modal data, large-scale public datasets for benchmarking, theoretical analysis of objects shapes and contours. This associate team originally started in 2016, and was extended in 2019 for another 3 years.</p>
            </li>
          </sanspuceslist>
        </subsection>
      </subsection>
      <subsection id="uid157" level="2">
        <bodyTitle>Inria International Partners</bodyTitle>
        <subsection id="uid158" level="3">
          <bodyTitle>Informal International Partners</bodyTitle>
          <simplelist>
            <li id="uid159">
              <p noindent="true"><b>MPI Tübingen:</b> Cordelia Schmid collaborates with Michael Black, a research director at MPI, starting in 2013. End of 2015 she was award a Humbolt research award funding a long-term research project with colleagues at MPI. In 2019, the project resulted in the development of an approach for object interaction <ref xlink:href="#thoth-2019-bid22" location="biblio" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest"/>.</p>
            </li>
          </simplelist>
        </subsection>
      </subsection>
      <subsection id="uid160" level="2">
        <bodyTitle>Participation in Other International Programs</bodyTitle>
        <simplelist>
          <li id="uid161">
            <p noindent="true"><b>Indo-French project EVEREST</b> with IIIT Hyderabad, India, funded by CEFIPRA (Centre Franco-Indien pour la Promotion de la Recherche Avancee). The aim of this project between Cordelia Schmid, Karteek Alahari and C. V. Jawahar (IIIT Hyderabad) is to enable the use of rich, complex models that are required to address the challenges of high-level computer vision. The work plan for the project will follow three directions. First, we will develop a learning framework that can handle weak annotations. Second, we will build formulations to solve the non-convex optimization problem resulting from the learning framework. Third, we will develop efficient and accurate energy minimization algorithms, in order to make the optimization computationally feasible.</p>
          </li>
        </simplelist>
      </subsection>
    </subsection>
    <subsection id="uid162" level="1">
      <bodyTitle>International Research Visitors</bodyTitle>
      <subsection id="uid163" level="2">
        <bodyTitle>Visits of International Scientists</bodyTitle>
        <subsection id="uid164" level="3">
          <bodyTitle>Internships</bodyTitle>
          <simplelist>
            <li id="uid165">
              <p noindent="true">Pia Bideau (PhD Student, Univ. Massachusetts Amherst) was an intern in the team until Jan 2019.</p>
            </li>
            <li id="uid166">
              <p noindent="true">Avijit Dasgupta (PhD Student, IIIT Hyderabad, India) was an intern in the team from Feb to May 2019.</p>
            </li>
            <li id="uid167">
              <p noindent="true">Gunnar Sigurdsson (PhD student, CMU) was an intern in the team from Jan to Mar 2019.</p>
            </li>
          </simplelist>
        </subsection>
      </subsection>
    </subsection>
  </partenariat>
  <diffusion id="uid168">
    <bodyTitle>Dissemination</bodyTitle>
    <subsection id="uid169" level="1">
      <bodyTitle>Promoting Scientific Activities</bodyTitle>
      <subsection id="uid170" level="2">
        <bodyTitle>Scientific Events: Organisation</bodyTitle>
        <subsection id="uid171" level="3">
          <bodyTitle>General Chair, Scientific Chair</bodyTitle>
          <simplelist>
            <li id="uid172">
              <p noindent="true">C. Schmid is a general chair for ECCV 2020, ICCV 2023.</p>
            </li>
          </simplelist>
        </subsection>
        <subsection id="uid173" level="3">
          <bodyTitle>Member of the Organizing Committees</bodyTitle>
          <simplelist>
            <li id="uid174">
              <p noindent="true">K. Alahari and J. Mairal co-organized the international summer school PAISS 2019.</p>
            </li>
            <li id="uid175">
              <p noindent="true">J. Mairal is a member of the organizing committee for the international conference SIAM Imaging Science 2020.</p>
            </li>
            <li id="uid176">
              <p noindent="true">J. Mairal is a co-organizer of the workshop OSL'19 at Les Houches.</p>
            </li>
            <li id="uid177">
              <p noindent="true">J. Mairal co-organized a discussion session at the Ellis/Dali workshop, San Sebastian, 2019.</p>
            </li>
          </simplelist>
        </subsection>
      </subsection>
      <subsection id="uid178" level="2">
        <bodyTitle>Scientific Events: Selection</bodyTitle>
        <subsection id="uid179" level="3">
          <bodyTitle>Member of the Conference Program Committees</bodyTitle>
          <simplelist>
            <li id="uid180">
              <p noindent="true">K. Alahari: area chair for CVPR 2020, ICCV 2019.</p>
            </li>
            <li id="uid181">
              <p noindent="true">K. Alahari: senior program committee member for AAAI 2020, IJCAI 2019, IJCAI 2020.</p>
            </li>
            <li id="uid182">
              <p noindent="true">K. Alahari: doctoral consortium chair for ICCV 2023.</p>
            </li>
            <li id="uid183">
              <p noindent="true">J. Mairal: area chair for NeurIPS 2019, AISTATS 2020 and ECCV 2020.</p>
            </li>
            <li id="uid184">
              <p noindent="true">J. Mairal: tutorial chair for CVPR 2022.</p>
            </li>
            <li id="uid185">
              <p noindent="true">C. Schmid: area chair for ICCV 2019.</p>
            </li>
            <li id="uid186">
              <p noindent="true">C. Schmid: senior area chair for NeurIPS 2019.</p>
            </li>
            <li id="uid187">
              <p noindent="true">J. Verbeek: area chair for ICCV 2019.</p>
            </li>
          </simplelist>
        </subsection>
        <subsection id="uid188" level="3">
          <bodyTitle>Reviewer</bodyTitle>
          <p>The permanent members, postdocs and senior PhD students of the team reviewed numerous
papers for international conferences in artificial intelligence, computer
vision and machine learning, including AAAI, AISTATS, CVPR, ICCV, ICML, ICLR,
NeurIPS in 2019.</p>
        </subsection>
      </subsection>
      <subsection id="uid189" level="2">
        <bodyTitle>Journal</bodyTitle>
        <subsection id="uid190" level="3">
          <bodyTitle>Member of the Editorial Boards</bodyTitle>
          <simplelist>
            <li id="uid191">
              <p noindent="true">K. Alahari: Associate editor of the International Journal of Computer Vision, since 2019.</p>
            </li>
            <li id="uid192">
              <p noindent="true">K. Alahari: Associate editor for Computer Vision and Image Understanding journal, since 2018.</p>
            </li>
            <li id="uid193">
              <p noindent="true">J. Mairal: Associate editor of the Journal of Machine Learning Research (JMLR), since 2019.</p>
            </li>
            <li id="uid194">
              <p noindent="true">J. Mairal: Associate editor of the International Journal of Computer Vision, since 2015.</p>
            </li>
            <li id="uid195">
              <p noindent="true">J. Mairal: Associate editor of Journal of Mathematical Imaging and Vision, since 2015.</p>
            </li>
            <li id="uid196">
              <p noindent="true">J. Mairal: Associate editor of the SIAM Journal of Imaging Science, since 2018.</p>
            </li>
            <li id="uid197">
              <p noindent="true">J. Verbeek: Associate editor International Journal on Computer Vision, 2014-2019.</p>
            </li>
            <li id="uid198">
              <p noindent="true">J. Verbeek: Associate editor IEEE Transactions Pattern Analysis and Machine Intelligence, since 2018.</p>
            </li>
          </simplelist>
        </subsection>
        <subsection id="uid199" level="3">
          <bodyTitle>Reviewer - Reviewing Activities</bodyTitle>
          <p>The permanent members, postdocs and senior PhD students of the team reviewed
numerous papers for international journals in computer vision (IJCV, PAMI,
CVIU), machine learning (JMLR, Machine Learning). Some of them also
review for other reputed journals such as PLOS ONE, SIAM Journal on Optimization, SIAM Imaging Science.</p>
        </subsection>
      </subsection>
      <subsection id="uid200" level="2">
        <bodyTitle>Invited Talks</bodyTitle>
        <simplelist>
          <li id="uid201">
            <p noindent="true">K. Alahari: Speaker on the Panel on AI and Mathematics, Knowledge Summit, Lyon, France, 2019.</p>
          </li>
          <li id="uid202">
            <p noindent="true">K. Alahari: Invited talk, LIAMA workshop, Paris, France, 2019.</p>
          </li>
          <li id="uid203">
            <p noindent="true">A. Bietti: Invited talk, GIPSA Lab, Grenoble, 2019.</p>
          </li>
          <li id="uid204">
            <p noindent="true">A. Bietti: Invited talk, UC Berkeley, 2019.</p>
          </li>
          <li id="uid205">
            <p noindent="true">A. Bietti: Seminar, Microsoft Research AI, Redmond, 2019.</p>
          </li>
          <li id="uid206">
            <p noindent="true">A. Bietti: Seminar, TTI-Chicago, 2019.</p>
          </li>
          <li id="uid207">
            <p noindent="true">D. Chen: Machine Learning in Computational Biology (MLCB) workshop on recurrent kernel networks, Vancouver, 2019.</p>
          </li>
          <li id="uid208">
            <p noindent="true">J. Mairal: Invited talk at the YES workshop, Eindhoven, 2019.</p>
          </li>
          <li id="uid209">
            <p noindent="true">J. Mairal: Talk in mini-symposium, ICCOPT, Berlin, 2019.</p>
          </li>
          <li id="uid210">
            <p noindent="true">J. Mairal: Invited talk at the Imaging and Machine Learning conference, IHP, Paris, 2019.</p>
          </li>
          <li id="uid211">
            <p noindent="true">J. Mairal: Seminar. Centrale Lille, 2019.</p>
          </li>
          <li id="uid212">
            <p noindent="true">R. Klokov: Invited talk at Christmas Colloquium on Computer Vision, Yandex, Moscow, 2019.</p>
          </li>
          <li id="uid213">
            <p noindent="true">C. Schmid: Invited speaker at BMVA symposium in Video Understanding, London, September 2019.</p>
          </li>
          <li id="uid214">
            <p noindent="true">C. Schmid: Keynote speaker at BMVC, Cardiff, UK, September 2019.</p>
          </li>
          <li id="uid215">
            <p noindent="true">C. Schmid: Keynote speaker at SIGIR, Paris, July 2019.</p>
          </li>
          <li id="uid216">
            <p noindent="true">C. Schmid: Invited speaker at Computer Vision after 5 Years, in conjunction with CVPR, June 2019.</p>
          </li>
          <li id="uid217">
            <p noindent="true">C. Schmid: Invited speaker at Tutorial on Unifying Human Activity Understanding, in conjunction with CVPR, June 2019.</p>
          </li>
          <li id="uid218">
            <p noindent="true">C. Schmid: Invited speaker at Facebook AI Video Summit, June 2019.</p>
          </li>
          <li id="uid219">
            <p noindent="true">C. Schmid: Keynote speaker at AI Experts Workshop in conjunction with the AI for Good Global Summit, Geneva, May 2019.</p>
          </li>
          <li id="uid220">
            <p noindent="true">C. Schmid: Invited speaker at Women in Data Science Conference, Zürich, April 2019.</p>
          </li>
          <li id="uid221">
            <p noindent="true">C. Schmid: Invited speaker at Collège de France seminar (chair of Stephane Mallat), February 2019.</p>
          </li>
          <li id="uid222">
            <p noindent="true">C. Schmid: Talk at Google EMEA research days, Zurich, December 2019.</p>
          </li>
          <li id="uid223">
            <p noindent="true">C. Schmid: Talk at Workshop on AI for Robotics, Naver, Grenoble, November 2019.</p>
          </li>
          <li id="uid224">
            <p noindent="true">C. Schmid: Talk at Workshop, Robotics: A Challenge for the Artificial Intelligence, Toulouse, October 2019.</p>
          </li>
          <li id="uid225">
            <p noindent="true">C. Schmid: Presentation at PRAIRIE inauguration, Paris, October 2019.</p>
          </li>
          <li id="uid226">
            <p noindent="true">C. Schmid: Seminar at DeepMind, London, September 2019.</p>
          </li>
          <li id="uid227">
            <p noindent="true">C. Schmid: Seminar at Intel Network on Intelligent Systems, Munich, September 2019.</p>
          </li>
          <li id="uid228">
            <p noindent="true">C. Schmid: Seminar at Ellis workshop, September 2019.</p>
          </li>
          <li id="uid229">
            <p noindent="true">C. Schmid: Seminar at MPI Tübingen, July 2019.</p>
          </li>
          <li id="uid230">
            <p noindent="true">C. Schmid: Seminar at WILLOW/SIERRA retreat, Marseille, June 2019.</p>
          </li>
          <li id="uid231">
            <p noindent="true">C. Schmid: Dinner speaker at the workshop “Women in Computer Vision”, in conjunction with CVPR'19.</p>
          </li>
          <li id="uid232">
            <p noindent="true">C. Schmid: Seminar at Google MTV, April 2019.</p>
          </li>
          <li id="uid233">
            <p noindent="true">C. Schmid: Seminar at ETH Zürich, March 2019.</p>
          </li>
          <li id="uid234">
            <p noindent="true">J. Verbeek: Invited talk at Breaking the Surface Workshop on maritime robotics and its applications, Biograd na Moru, Croatia, Oct 2019.</p>
          </li>
          <li id="uid235">
            <p noindent="true">J. Verbeek: Invited talk at Dagstuhl Workshop on Joint Processing of Language and Visual Data for Better Automated Understanding, Germany, Jan 2019.</p>
          </li>
          <li id="uid236">
            <p noindent="true">D. Wynen: SMILE Reading Group Paris, 2019.</p>
          </li>
        </simplelist>
      </subsection>
      <subsection id="uid237" level="2">
        <bodyTitle>Leadership within the Scientific Community</bodyTitle>
        <simplelist>
          <li id="uid238">
            <p noindent="true">J. Mairal, J. Verbeek and C. Schmid became Ellis fellows.</p>
          </li>
          <li id="uid239">
            <p noindent="true">C. Schmid: Participation in a round table on AI, a technology for innovation, forum 5i, Grenoble, May 2019.</p>
          </li>
          <li id="uid240">
            <p noindent="true">C. Schmid: Animating several mentorship sessions at Women in Data Science Conference, Zürich, April 2019.</p>
          </li>
          <li id="uid241">
            <p noindent="true">C. Schmid: Mentor at the Doctoral Consortium, in conjunction with ICCV'19, CVPR'19.</p>
          </li>
          <li id="uid242">
            <p noindent="true">C. Schmid: Mentor for female PhD students at the workshop “Women in Computer Vision", CVPR'19.</p>
          </li>
        </simplelist>
      </subsection>
      <subsection id="uid243" level="2">
        <bodyTitle>Scientific Expertise</bodyTitle>
        <simplelist>
          <li id="uid244">
            <p noindent="true">J. Mairal: Judge for the IBM Watson AI Xprize.</p>
          </li>
          <li id="uid245">
            <p noindent="true">J. Mairal: Expert for ANR.</p>
          </li>
        </simplelist>
      </subsection>
      <subsection id="uid246" level="2">
        <bodyTitle>Research Administration</bodyTitle>
        <simplelist>
          <li id="uid247">
            <p noindent="true">K. Alahari: One of the two referents for Human Resources - Excellence in Research (HRS4R) at Inria Grenoble.</p>
          </li>
          <li id="uid248">
            <p noindent="true">J. Mairal: Jury member for the Inria starting and advanced research positions.</p>
          </li>
          <li id="uid249">
            <p noindent="true">C. Schmid: Member of Scientific Advisory Committee of the Helmholtz AI Cooperation Unit, 2020—</p>
          </li>
          <li id="uid250">
            <p noindent="true">C. Schmid: Member of scientific advisory board for the German Competence Centers for AI Research, 2019—</p>
          </li>
          <li id="uid251">
            <p noindent="true">J. Verbeek: Member steering committee MinaLogic, innovation cluster for digital technologies based in France's Auvergne-Rhône-Alpes region, 2018-2019.</p>
          </li>
          <li id="uid252">
            <p noindent="true">J. Verbeek: Scientific correspondent national project calls, Inria Grenoble, 2017-2019.</p>
          </li>
          <li id="uid253">
            <p noindent="true">J. Verbeek: Member Scientific council Advanced Data Mining axis of <ref xlink:href="https://persyval-lab.org/" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">Persyval Laboratory of Excellence</ref>, Grenoble, 2015-2019.</p>
          </li>
          <li id="uid254">
            <p noindent="true">J. Verbeek: Member Inria Grenoble working group on HPC - Big Data - Machine learning, 2018-2019.</p>
          </li>
          <li id="uid255">
            <p noindent="true">J. Verbeek: Member of Inria Commission Administrative Paritaire (advises on matters about individual careers: such as promotions, temporary outsourcing, etc.), 2016-2019.</p>
          </li>
        </simplelist>
      </subsection>
    </subsection>
    <subsection id="uid256" level="1">
      <bodyTitle>Teaching - Supervision - Juries</bodyTitle>
      <subsection id="uid257" level="2">
        <bodyTitle>Teaching</bodyTitle>
        <simplelist>
          <li id="uid258">
            <p noindent="true">Doctorat: K. Alahari, Lecturer at the CVIT summer school on machine learning, 4h eqTD, IIIT Hyderabad, India.</p>
          </li>
          <li id="uid259">
            <p noindent="true">Doctorat: J. Mairal, Large-Scale Optimization for Machine Learning, 9h eqTD, Lecture at OBA summer school, Veroli, 2019.</p>
          </li>
          <li id="uid260">
            <p noindent="true">Doctorat: J. Mairal, Large-Scale Optimization for Machine Learning, 4.5h eqTD. Invited Tutorial at IEEE Data Science Workshop on Minneapolis, 2019.</p>
          </li>
          <li id="uid261">
            <p noindent="true">Doctorat: J. Mairal, Large-Scale Optimization for Machine Learning, 4.5h eqTD. Invited Tutorial at the Quantitative BioImaging Conference, Rennes, 2019.</p>
          </li>
          <li id="uid262">
            <p noindent="true">Doctorat: C. Schmid, Course on action recognition at Data Science Summer School, Paris, June 2019.</p>
          </li>
          <li id="uid263">
            <p noindent="true">Doctorat: C. Schmid, Course on action recognition at Prairie artificial intelligence summer school (PAISS), 2.25h eqTD, Paris, October 2019.</p>
          </li>
          <li id="uid264">
            <p noindent="true">Master: K. Alahari, C. Schmid, Object recognition, Master-2 Computer Science, Grenoble University, 15.75h eqTD together, 2019.</p>
          </li>
          <li id="uid265">
            <p noindent="true">Master: K. Alahari, Understanding Big Visual Data, 13.5h eqTD, M2, Grenoble INP, France.</p>
          </li>
          <li id="uid266">
            <p noindent="true">Master: K. Alahari, Graphical Models Inference and Learning, 18h eqTD, M2, CentraleSupelec, Paris, France.</p>
          </li>
          <li id="uid267">
            <p noindent="true">Master: K. Alahari, Introduction to computer vision, 9h eqTD, M1, ENS Paris, France.</p>
          </li>
          <li id="uid268">
            <p noindent="true">Master: J. Mairal, Kernel methods for statistical learning, 15h eqTD, M2, Ecole Normale Supérieure, Cachan, France.</p>
          </li>
          <li id="uid269">
            <p noindent="true">Master: J. Mairal, Advanced Learning Models, 13.5h eqTD, M2, UGA, Grenoble.</p>
          </li>
          <li id="uid270">
            <p noindent="true">Master: C. Schmid, Object recognition and computer vision, Master-2 MVA, ENS, 9h eqTD, 2019.</p>
          </li>
          <li id="uid271">
            <p noindent="true">Master: A. Sablayrolles, Fundamentals of Machine Learning, African Masters of Machine Intelligence, Kigali, Rwanda.</p>
          </li>
        </simplelist>
      </subsection>
      <subsection id="uid272" level="2">
        <bodyTitle>Supervision</bodyTitle>
        <sanspuceslist>
          <li id="uid273">
            <p noindent="true">HDR: Karteek Alahari, Human, Motion and Other Priors for Partially-Supervised Recognition, Univ. Grenoble Alpes, 28/1/2019.</p>
          </li>
          <li id="uid274">
            <p noindent="true">PhD: Alberto Bietti, Foundations of deep convolutional models through kernel methods, Univ. Grenoble Alpes, 27/11/2019, director: Julien Mairal.</p>
          </li>
          <li id="uid275">
            <p noindent="true">PhD: Nikita Dvornik, Learning with Limited Annotated Data for Visual Understanding, Univ. Grenoble Alpes, 26/11/2019, thesis directors: Cordelia Schmid and Julien Mairal.</p>
          </li>
          <li id="uid276">
            <p noindent="true">PhD: Konstantin Shmelkov, Approaches for incremental learning and image generation, Univ. Grenoble Alpes, 29/3/2019, thesis directors: Karteek Alahari and Cordelia Schmid.</p>
          </li>
        </sanspuceslist>
      </subsection>
      <subsection id="uid277" level="2">
        <bodyTitle>Juries</bodyTitle>
        <simplelist>
          <li id="uid278">
            <p noindent="true">K. Alahari: External examiner for the PhD thesis of Alessandro di Martino, University of Bath, UK.</p>
          </li>
          <li id="uid279">
            <p noindent="true">K. Alahari: Examiner for the PhD thesis of Thomas Robert, Sorbonne Université, Paris, France.</p>
          </li>
          <li id="uid280">
            <p noindent="true">K. Alahari: Examiner for the PhD thesis of D. Khuê Lê-Huu, Université Paris-Saclay, France.</p>
          </li>
          <li id="uid281">
            <p noindent="true">K. Alahari: Member of comité de suivi for the PhD thesis of Miguel Angel Solinas, Univ. Grenoble-Alpes, France.</p>
          </li>
          <li id="uid282">
            <p noindent="true">J. Mairal: Reviewer for the PhD thesis of Zhenyu Liao, Université Paris-Saclay.</p>
          </li>
          <li id="uid283">
            <p noindent="true">J. Mairal: Reviewer for the PhD thesis of Belhal Karimi, Université Paris-Saclay</p>
          </li>
          <li id="uid284">
            <p noindent="true">J. Mairal: Reviewer for the PhD thesis of Martin Bompaire, Université Paris-Saclay</p>
          </li>
          <li id="uid285">
            <p noindent="true">J. Mairal: Reviewer for the PhD thesis of Yassine Yaakoubi, Polytechnique Montréal.</p>
          </li>
          <li id="uid286">
            <p noindent="true">J. Mairal: Examinateur for the PhD thesis of Mathurin Massias, Université Paris-Saclay.</p>
          </li>
          <li id="uid287">
            <p noindent="true">J. Mairal: Member of comité de suivi for the PhD thesis of Olga Permiakova, Univ. Grenoble Alpes.</p>
          </li>
          <li id="uid288">
            <p noindent="true">J. Verbeek: Member supervisory commitee for PhD of Riccardo Del Chiaro, 2018-2020, Univ. Florence, Italy.</p>
          </li>
          <li id="uid289">
            <p noindent="true">J. Verbeek: Member supervisory commitee for PhD of Fabien Baradel, 2017-2019, INSA Lyon, France.</p>
          </li>
          <li id="uid290">
            <p noindent="true">J. Verbeek: External reviewer for Shell Xu Hu, 2019, Ecole des Ponts, Paris Tech, Univ. Paris Est, Paris, France.</p>
          </li>
          <li id="uid291">
            <p noindent="true">J. Verbeek: Rapporteur for Hedi Ben Younes, 2019, Sorbonne University, Paris, France.</p>
          </li>
        </simplelist>
      </subsection>
    </subsection>
  </diffusion>
  <biblio id="bibliography" html="bibliography" numero="10" titre="Bibliography">
    
    <biblStruct id="thoth-2019-bid32" type="hdrthesis" rend="year" n="cite:alahari:tel-02269024">
      <identifiant type="hal" value="tel-02269024"/>
      <monogr>
        <title level="m">Human, Motion and Other Priors for Partially-Supervised Recognition</title>
        <author>
          <persName key="thoth-2018-idp115024">
            <foreName>Karteek</foreName>
            <surname>Alahari</surname>
            <initial>K.</initial>
          </persName>
        </author>
        <imprint>
          <publisher>
            <orgName type="school">Communauté Université Grenoble Alpes</orgName>
          </publisher>
          <dateStruct>
            <month>January</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/tel-02269024" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>tel-02269024</ref>
        </imprint>
      </monogr>
      <note type="typdoc">Habilitation à diriger des recherches</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid38" type="phdthesis" rend="year" n="cite:shmelkov:tel-02183259">
      <identifiant type="hal" value="tel-02183259"/>
      <monogr>
        <title level="m">Approaches for incremental learning and image generation</title>
        <author>
          <persName key="thoth-2018-idp172032">
            <foreName>Konstantin</foreName>
            <surname>Shmelkov</surname>
            <initial>K.</initial>
          </persName>
        </author>
        <imprint>
          <publisher>
            <orgName type="school">Université Grenoble Alpes</orgName>
          </publisher>
          <dateStruct>
            <month>March</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://tel.archives-ouvertes.fr/tel-02183259" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>tel.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>tel-02183259</ref>
        </imprint>
      </monogr>
      <note type="typdoc">Theses</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid14" type="article" rend="year" n="cite:bietti:hal-01536004">
      <identifiant type="hal" value="hal-01536004"/>
      <analytic>
        <title level="a">Group Invariance, Stability to Deformations, and Complexity of Deep Convolutional Representations</title>
        <author>
          <persName key="thoth-2018-idp133056">
            <foreName>Alberto</foreName>
            <surname>Bietti</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid01187">
        <idno type="issn">1532-4435</idno>
        <title level="j">Journal of Machine Learning Research</title>
        <imprint>
          <biblScope type="volume">20</biblScope>
          <biblScope type="number">1</biblScope>
          <dateStruct>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-49</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-01536004" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01536004</ref>
        </imprint>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1706.03078" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1706.<allowbreak/>03078</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid18" type="article" rend="year" n="cite:chen:hal-01632912">
      <identifiant type="doi" value="10.1093/bioinformatics/btz094"/>
      <identifiant type="hal" value="hal-01632912"/>
      <analytic>
        <title level="a">Biological Sequence Modeling with Convolutional Kernel Networks</title>
        <author>
          <persName key="thoth-2018-idp137968">
            <foreName>Dexiong</foreName>
            <surname>Chen</surname>
            <initial>D.</initial>
          </persName>
          <persName key="erable-2018-idp209824">
            <foreName>Laurent</foreName>
            <surname>Jacob</surname>
            <initial>L.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00243">
        <idno type="issn">1367-4803</idno>
        <title level="j">Bioinformatics</title>
        <imprint>
          <biblScope type="volume">35</biblScope>
          <biblScope type="number">18</biblScope>
          <dateStruct>
            <month>September</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">3294–3302</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-01632912" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01632912</ref>
        </imprint>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid2" type="article" rend="year" n="cite:derkach:hal-02267568">
      <identifiant type="doi" value="10.1007/s11263-019-01208-x"/>
      <identifiant type="hal" value="hal-02267568"/>
      <analytic>
        <title level="a">Tensor Decomposition and Non-linear Manifold Modeling for 3D Head Pose Estimation</title>
        <author>
          <persName>
            <foreName>Dmytro</foreName>
            <surname>Derkach</surname>
            <initial>D.</initial>
          </persName>
          <persName>
            <foreName>Adrià</foreName>
            <surname>Ruiz</surname>
            <initial>A.</initial>
          </persName>
          <persName>
            <foreName>Federico M</foreName>
            <surname>Sukno</surname>
            <initial>F. M.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00880">
        <idno type="issn">0920-5691</idno>
        <title level="j">International Journal of Computer Vision</title>
        <imprint>
          <biblScope type="volume">127</biblScope>
          <biblScope type="number">10</biblScope>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1565-1585</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02267568" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02267568</ref>
        </imprint>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid28" type="article" rend="year" n="cite:durif:hal-01649275">
      <identifiant type="doi" value="10.1093/bioinformatics/btz177"/>
      <identifiant type="hal" value="hal-01649275"/>
      <analytic>
        <title level="a">Probabilistic Count Matrix Factorization for Single Cell Expression Data Analysis</title>
        <author>
          <persName key="thoth-2018-idp186640">
            <foreName>Ghislain</foreName>
            <surname>Durif</surname>
            <initial>G.</initial>
          </persName>
          <persName>
            <foreName>Laurent</foreName>
            <surname>Modolo</surname>
            <initial>L.</initial>
          </persName>
          <persName>
            <foreName>Jeff E</foreName>
            <surname>Mold</surname>
            <initial>J. E.</initial>
          </persName>
          <persName>
            <foreName>Sophie</foreName>
            <surname>Lambert-Lacroix</surname>
            <initial>S.</initial>
          </persName>
          <persName>
            <foreName>Franck</foreName>
            <surname>Picard</surname>
            <initial>F.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00243">
        <idno type="issn">1367-4803</idno>
        <title level="j">Bioinformatics</title>
        <imprint>
          <biblScope type="volume">20</biblScope>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">4011–4019</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01649275" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01649275</ref>
        </imprint>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1710.11028" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1710.<allowbreak/>11028</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid29" subtype="nonparu-d" type="article" rend="year" n="cite:dvornik:hal-01869784">
      <identifiant type="doi" value="10.1109/TPAMI.2019.2961896"/>
      <identifiant type="hal" value="hal-01869784"/>
      <analytic>
        <title level="a">On the Importance of Visual Context for Data Augmentation in Scene Understanding</title>
        <author>
          <persName>
            <foreName>Nikita</foreName>
            <surname>Dvornik</surname>
            <initial>N.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00747">
        <idno type="issn">0162-8828</idno>
        <title level="j">IEEE Transactions on Pattern Analysis and Machine Intelligence</title>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-15</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01869784" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01869784</ref>
        </imprint>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid39" type="article" rend="year" n="cite:lin:hal-01376079">
      <identifiant type="doi" value="10.1137/17M1125157"/>
      <identifiant type="hal" value="hal-01376079"/>
      <analytic>
        <title level="a">An Inexact Variable Metric Proximal Point Algorithm for Generic Quasi-Newton Acceleration</title>
        <author>
          <persName>
            <foreName>Hongzhou</foreName>
            <surname>Lin</surname>
            <initial>H.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
          <persName>
            <foreName>Zaid</foreName>
            <surname>Harchaoui</surname>
            <initial>Z.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid01738">
        <idno type="issn">1052-6234</idno>
        <title level="j">SIAM Journal on Optimization</title>
        <imprint>
          <biblScope type="volume">29</biblScope>
          <biblScope type="number">2</biblScope>
          <dateStruct>
            <month>May</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1408-1443</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-01376079" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01376079</ref>
        </imprint>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1610.00960" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1610.<allowbreak/>00960</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid30" subtype="nonparu-d" type="article" rend="year" n="cite:rogez:hal-01961189">
      <identifiant type="doi" value="10.1109/TPAMI.2019.2892985"/>
      <identifiant type="hal" value="hal-01961189"/>
      <analytic>
        <title level="a">LCR-Net++: Multi-person 2D and 3D Pose Detection in Natural Images</title>
        <author>
          <persName key="thoth-2018-idp117488">
            <foreName>Gregory</foreName>
            <surname>Rogez</surname>
            <initial>G.</initial>
          </persName>
          <persName>
            <foreName>Philippe</foreName>
            <surname>Weinzaepfel</surname>
            <initial>P.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00747">
        <idno type="issn">0162-8828</idno>
        <title level="j">IEEE Transactions on Pattern Analysis and Machine Intelligence</title>
        <imprint>
          <dateStruct>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-15</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01961189" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01961189</ref>
        </imprint>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid40" type="article" rend="year" n="cite:tokmakov:hal-01653720">
      <identifiant type="doi" value="10.1007/s11263-018-1122-2"/>
      <identifiant type="hal" value="hal-01653720"/>
      <analytic>
        <title level="a">Learning to Segment Moving Objects</title>
        <author>
          <persName key="thoth-2018-idp176896">
            <foreName>Pavel</foreName>
            <surname>Tokmakov</surname>
            <initial>P.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName key="thoth-2018-idp115024">
            <foreName>Karteek</foreName>
            <surname>Alahari</surname>
            <initial>K.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-editorial-board="yes" x-international-audience="yes" id="rid00880">
        <idno type="issn">0920-5691</idno>
        <title level="j">International Journal of Computer Vision</title>
        <imprint>
          <biblScope type="volume">127</biblScope>
          <biblScope type="number">3</biblScope>
          <dateStruct>
            <month>March</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">282–301</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01653720" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01653720</ref>
        </imprint>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1712.01127" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1712.<allowbreak/>01127</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid33" type="inproceedings" rend="year" n="cite:bietti:hal-02144221">
      <identifiant type="hal" value="hal-02144221"/>
      <analytic>
        <title level="a">On the Inductive Bias of Neural Tangent Kernels</title>
        <author>
          <persName key="thoth-2018-idp133056">
            <foreName>Alberto</foreName>
            <surname>Bietti</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">NeurIPS 2019 - Thirty-third Conference on Neural Information Processing Systems</title>
        <loc>Vancouver, Canada</loc>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-24</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02144221" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02144221</ref>
        </imprint>
        <meeting id="cid29560">
          <title>Annual Conference on Neural Information Processing Systems</title>
          <num>33</num>
          <abbr type="sigle">NIPS</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1905.12173" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1905.<allowbreak/>12173</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid15" type="inproceedings" rend="year" n="cite:bietti:hal-01884632">
      <identifiant type="hal" value="hal-01884632"/>
      <analytic>
        <title level="a">A Kernel Perspective for Regularizing Deep Neural Networks</title>
        <author>
          <persName key="thoth-2018-idp133056">
            <foreName>Alberto</foreName>
            <surname>Bietti</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp164736">
            <foreName>Grégoire</foreName>
            <surname>Mialon</surname>
            <initial>G.</initial>
          </persName>
          <persName key="thoth-2018-idp137968">
            <foreName>Dexiong</foreName>
            <surname>Chen</surname>
            <initial>D.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICML 2019 - 36th International Conference on Machine Learning</title>
        <loc>Long Beach, United States</loc>
        <title level="s">Proceedings of Machine Learning Research</title>
        <imprint>
          <biblScope type="volume">97</biblScope>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">664-674</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-01884632" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01884632</ref>
        </imprint>
        <meeting id="cid32516">
          <title>International Conference on Machine Learning</title>
          <num>36</num>
          <abbr type="sigle">ICML</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1810.00363" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1810.<allowbreak/>00363</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid5" type="inproceedings" rend="year" n="cite:caron:hal-02119564">
      <identifiant type="hal" value="hal-02119564"/>
      <analytic>
        <title level="a">Unsupervised Pre-Training of Image Features on Non-Curated Data</title>
        <author>
          <persName key="thoth-2018-idp135488">
            <foreName>Mathilde</foreName>
            <surname>Caron</surname>
            <initial>M.</initial>
          </persName>
          <persName>
            <foreName>Piotr</foreName>
            <surname>Bojanowski</surname>
            <initial>P.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
          <persName>
            <foreName>Armand</foreName>
            <surname>Joulin</surname>
            <initial>A.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICCV 2019 - International Conference on Computer Vision</title>
        <loc>Seoul, South Korea</loc>
        <title level="s">Proceedings of the International Conference on Computer Vision (ICCV)</title>
        <imprint>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-10</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02119564" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02119564</ref>
        </imprint>
        <meeting id="cid82250">
          <title>IEEE International Conference on Computer Vision</title>
          <num>2019</num>
          <abbr type="sigle">ICCV</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid19" type="inproceedings" rend="year" n="cite:chen:hal-02388776">
      <identifiant type="doi" value="10.1007/978-3-030-17083-7"/>
      <identifiant type="hal" value="hal-02388776"/>
      <analytic>
        <title level="a">Biological Sequence Modeling with Convolutional Kernel Networks</title>
        <author>
          <persName key="thoth-2018-idp137968">
            <foreName>Dexiong</foreName>
            <surname>Chen</surname>
            <initial>D.</initial>
          </persName>
          <persName key="erable-2018-idp209824">
            <foreName>Laurent</foreName>
            <surname>Jacob</surname>
            <initial>L.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">RECOMB 2019 - 23rd Annual International Conference Research in Computational Molecular Biology</title>
        <loc>Washington DC, United States</loc>
        <imprint>
          <publisher>
            <orgName>Springer</orgName>
          </publisher>
          <dateStruct>
            <month>May</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-2</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02388776" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02388776</ref>
        </imprint>
        <meeting id="cid32695">
          <title>Annual International Conference on Research in Computational Molecular Biology</title>
          <num>2019</num>
          <abbr type="sigle">RECOMB</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid20" type="inproceedings" rend="year" n="cite:chen:hal-02151135">
      <identifiant type="hal" value="hal-02151135"/>
      <analytic>
        <title level="a">Recurrent Kernel Networks</title>
        <author>
          <persName key="thoth-2018-idp137968">
            <foreName>Dexiong</foreName>
            <surname>Chen</surname>
            <initial>D.</initial>
          </persName>
          <persName key="erable-2018-idp209824">
            <foreName>Laurent</foreName>
            <surname>Jacob</surname>
            <initial>L.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">NeurIPS 2019 - Thirty-third Conference Neural Information Processing Systems</title>
        <loc>Vancouver, Canada</loc>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-19</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02151135" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02151135</ref>
        </imprint>
        <meeting id="cid29560">
          <title>Annual Conference on Neural Information Processing Systems</title>
          <num>33</num>
          <abbr type="sigle">NIPS</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1906.03200" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1906.<allowbreak/>03200</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid23" type="inproceedings" rend="year" n="cite:crasto:hal-02140558">
      <identifiant type="hal" value="hal-02140558"/>
      <analytic>
        <title level="a">MARS: Motion-Augmented RGB Stream for Action Recognition</title>
        <author>
          <persName key="thoth-2018-idp198976">
            <foreName>Nieves</foreName>
            <surname>Crasto</surname>
            <initial>N.</initial>
          </persName>
          <persName>
            <foreName>Philippe</foreName>
            <surname>Weinzaepfel</surname>
            <initial>P.</initial>
          </persName>
          <persName key="thoth-2018-idp115024">
            <foreName>Karteek</foreName>
            <surname>Alahari</surname>
            <initial>K.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">CVPR 2019 - IEEE Conference on Computer Vision &amp; Pattern Recognition</title>
        <loc>Long Beach, CA, United States</loc>
        <imprint>
          <publisher>
            <orgName>IEEE</orgName>
          </publisher>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-10</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02140558" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02140558</ref>
        </imprint>
        <meeting id="cid82398">
          <title>IEEE International Conference on Computer Vision and Pattern Recognition</title>
          <num>2019</num>
          <abbr type="sigle">CVPR</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid4" type="inproceedings" rend="year" n="cite:dvornik:hal-02080004">
      <identifiant type="hal" value="hal-02080004"/>
      <analytic>
        <title level="a">Diversity with Cooperation: Ensemble Methods for Few-Shot Classification</title>
        <author>
          <persName>
            <foreName>Nikita</foreName>
            <surname>Dvornik</surname>
            <initial>N.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICCV 2019 - International Conference on Computer Vision</title>
        <loc>Seoul, South Korea</loc>
        <imprint>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-12</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02080004" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02080004</ref>
        </imprint>
        <meeting id="cid82250">
          <title>IEEE International Conference on Computer Vision</title>
          <num>2019</num>
          <abbr type="sigle">ICCV</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1903.11341" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1903.<allowbreak/>11341</ref> - Added experiments with different network architectures and input image resolutions</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid21" type="inproceedings" rend="year" n="cite:elbayad:hal-02422914">
      <identifiant type="hal" value="hal-02422914"/>
      <analytic>
        <title level="a">Depth-adaptive Transformer</title>
        <author>
          <persName key="thoth-2018-idp145264">
            <foreName>Maha</foreName>
            <surname>Elbayad</surname>
            <initial>M.</initial>
          </persName>
          <persName>
            <foreName>Jiatao</foreName>
            <surname>Gu</surname>
            <initial>J.</initial>
          </persName>
          <persName>
            <foreName>Edouard</foreName>
            <surname>Grave</surname>
            <initial>E.</initial>
          </persName>
          <persName>
            <foreName>Michael</foreName>
            <surname>Auli</surname>
            <initial>M.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICLR 2020 - Eighth International Conference on Learning Representations</title>
        <loc>Addis Ababa, Ethiopia</loc>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-14</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02422914" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02422914</ref>
        </imprint>
        <meeting id="cid624026">
          <title>International Conference on Learning Representations</title>
          <num>8</num>
          <abbr type="sigle">ICLR</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid9" type="inproceedings" rend="year" n="cite:gabeur:hal-02242795">
      <identifiant type="hal" value="hal-02242795"/>
      <analytic>
        <title level="a">Moulding Humans: Non-parametric 3D Human Shape Estimation from Single Images</title>
        <author>
          <persName key="thoth-2018-idp147696">
            <foreName>Valentin</foreName>
            <surname>Gabeur</surname>
            <initial>V.</initial>
          </persName>
          <persName key="morpheo-2019-idp154096">
            <foreName>Jean-Sébastien</foreName>
            <surname>Franco</surname>
            <initial>J.-S.</initial>
          </persName>
          <persName key="thoth-2018-idp191568">
            <foreName>Xavier</foreName>
            <surname>Martin</surname>
            <initial>X.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName key="thoth-2018-idp117488">
            <foreName>Gregory</foreName>
            <surname>Rogez</surname>
            <initial>G.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICCV 2019 - International Conference on Computer Vision</title>
        <loc>Seoul, South Korea</loc>
        <imprint>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-10</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02242795" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02242795</ref>
        </imprint>
        <meeting id="cid82250">
          <title>IEEE International Conference on Computer Vision</title>
          <num>2019</num>
          <abbr type="sigle">ICCV</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid22" type="inproceedings" rend="year" n="cite:hasson:hal-02429093">
      <identifiant type="hal" value="hal-02429093"/>
      <analytic>
        <title level="a">Learning joint reconstruction of hands and manipulated objects</title>
        <author>
          <persName key="thoth-2018-idp150128">
            <foreName>Yana</foreName>
            <surname>Hasson</surname>
            <initial>Y.</initial>
          </persName>
          <persName key="thoth-2018-idp179328">
            <foreName>Gül</foreName>
            <surname>Varol</surname>
            <initial>G.</initial>
          </persName>
          <persName>
            <foreName>Dimitrios</foreName>
            <surname>Tzionas</surname>
            <initial>D.</initial>
          </persName>
          <persName key="willow-2018-idp174496">
            <foreName>Igor</foreName>
            <surname>Kalevatykh</surname>
            <initial>I.</initial>
          </persName>
          <persName>
            <foreName>Michael J</foreName>
            <surname>Black</surname>
            <initial>M. J.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">CVPR 2019 - IEEE Conference on Computer Vision and Pattern Recognition</title>
        <loc>Long Beach, United States</loc>
        <imprint>
          <publisher>
            <orgName>IEEE</orgName>
          </publisher>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-14</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02429093" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02429093</ref>
        </imprint>
        <meeting id="cid82398">
          <title>IEEE International Conference on Computer Vision and Pattern Recognition</title>
          <num>2019</num>
          <abbr type="sigle">CVPR</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid0" type="inproceedings" rend="year" n="cite:klokov:hal-02268466">
      <identifiant type="hal" value="hal-02268466"/>
      <analytic>
        <title level="a">Probabilistic Reconstruction Networks for 3D Shape Inference from a Single Image</title>
        <author>
          <persName key="thoth-2018-idp154992">
            <foreName>Roman</foreName>
            <surname>Klokov</surname>
            <initial>R.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
          <persName key="morpheo-2018-idp143488">
            <foreName>Edmond</foreName>
            <surname>Boyer</surname>
            <initial>E.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">BMVC 2019 - British Machine Vision Conference</title>
        <loc>Cardiff, United Kingdom</loc>
        <imprint>
          <dateStruct>
            <month>September</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-15</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02268466" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02268466</ref>
        </imprint>
        <meeting id="cid38519">
          <title>British Machine Vision Conference</title>
          <num>30</num>
          <abbr type="sigle">BMVC</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1908.07475" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1908.<allowbreak/>07475</ref> - Awarded with Best Science Paper Honourable Mention Award at BMVC'19.</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid24" type="inproceedings" rend="year" n="cite:kulunchakov:hal-02139489">
      <identifiant type="hal" value="hal-02139489"/>
      <analytic>
        <title level="a">A Generic Acceleration Framework for Stochastic Composite Optimization</title>
        <author>
          <persName key="thoth-2018-idp157424">
            <foreName>Andrei</foreName>
            <surname>Kulunchakov</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">NeurIPS 2019 - Thirty-third Conference Neural Information Processing Systems</title>
        <loc>Vancouver, Canada</loc>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-24</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02139489" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02139489</ref>
        </imprint>
        <meeting id="cid29560">
          <title>Annual Conference on Neural Information Processing Systems</title>
          <num>33</num>
          <abbr type="sigle">NIPS</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1906.01164" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1906.<allowbreak/>01164</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid12" type="inproceedings" rend="year" n="cite:kulunchakov:hal-02121913">
      <identifiant type="arXiv" value="1901.08788"/>
      <identifiant type="hal" value="hal-02121913"/>
      <analytic>
        <title level="a">Estimate Sequences for Variance-Reduced Stochastic Composite Optimization</title>
        <author>
          <persName key="thoth-2018-idp157424">
            <foreName>Andrei</foreName>
            <surname>Kulunchakov</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICML 2019 - 36th International Conference on Machine Learning</title>
        <loc>Long Beach, United States</loc>
        <imprint>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-24</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02121913" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02121913</ref>
        </imprint>
        <meeting id="cid32516">
          <title>International Conference on Machine Learning</title>
          <num>36</num>
          <abbr type="sigle">ICML</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1905.02374" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1905.<allowbreak/>02374</ref> - short version of preprint arXiv:1901.08788</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid26" type="inproceedings" rend="year" n="cite:lucas:hal-01886285">
      <identifiant type="hal" value="hal-01886285"/>
      <analytic>
        <title level="a">Adaptive Density Estimation for Generative Models</title>
        <author>
          <persName key="thoth-2018-idp162304">
            <foreName>Thomas</foreName>
            <surname>Lucas</surname>
            <initial>T.</initial>
          </persName>
          <persName key="thoth-2018-idp172032">
            <foreName>Konstantin</foreName>
            <surname>Shmelkov</surname>
            <initial>K.</initial>
          </persName>
          <persName key="thoth-2018-idp115024">
            <foreName>Karteek</foreName>
            <surname>Alahari</surname>
            <initial>K.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">NeurIPS 2019 - Thirty-third Conference on Neural Information Processing Systems</title>
        <loc>Vancouver, Canada</loc>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-24</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01886285" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01886285</ref>
        </imprint>
        <meeting id="cid29560">
          <title>Annual Conference on Neural Information Processing Systems</title>
          <num>33</num>
          <abbr type="sigle">NIPS</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid6" type="inproceedings" rend="year" n="cite:pashevich:hal-02273326">
      <identifiant type="hal" value="hal-02273326"/>
      <analytic>
        <title level="a">Learning to Augment Synthetic Images for Sim2Real Policy Transfer</title>
        <author>
          <persName key="thoth-2018-idp167168">
            <foreName>Alexander</foreName>
            <surname>Pashevich</surname>
            <initial>A.</initial>
          </persName>
          <persName key="willow-2018-idp154880">
            <foreName>Robin</foreName>
            <surname>Strudel</surname>
            <initial>R.</initial>
          </persName>
          <persName key="willow-2018-idp174496">
            <foreName>Igor</foreName>
            <surname>Kalevatykh</surname>
            <initial>I.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">IROS 2019 - IEEE/RSJ International Conference on Intelligent Robots and Systems</title>
        <loc>Macao, China</loc>
        <imprint>
          <dateStruct>
            <month>November</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-6</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02273326" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02273326</ref>
        </imprint>
        <meeting id="cid93437">
          <title>IEEE RSJ International Conference on Intelligent Robots and Systems</title>
          <num>2019</num>
          <abbr type="sigle">IROS</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1903.07740" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1903.<allowbreak/>07740</ref> - 7 pages</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid25" type="inproceedings" rend="year" n="cite:peyre:hal-01975760">
      <identifiant type="hal" value="hal-01975760"/>
      <analytic>
        <title level="a">Detecting unseen visual relations using analogies</title>
        <author>
          <persName key="willow-2018-idp147584">
            <foreName>Julia</foreName>
            <surname>Peyre</surname>
            <initial>J.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName key="willow-2018-idp117872">
            <foreName>Josef</foreName>
            <surname>Sivic</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICCV 2019 - International Conference on Computer Vision</title>
        <loc>Seoul, South Korea</loc>
        <imprint>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-01975760" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-01975760</ref>
        </imprint>
        <meeting id="cid82250">
          <title>IEEE International Conference on Computer Vision</title>
          <num>2019</num>
          <abbr type="sigle">ICCV</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1812.05736v3" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1812.<allowbreak/>05736v3</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid1" type="inproceedings" rend="year" n="cite:ruiz:hal-01896007">
      <identifiant type="hal" value="hal-01896007"/>
      <analytic>
        <title level="a">Learning Disentangled Representations with Reference-Based Variational Autoencoders</title>
        <author>
          <persName>
            <foreName>Adrià</foreName>
            <surname>Ruiz</surname>
            <initial>A.</initial>
          </persName>
          <persName>
            <foreName>Oriol</foreName>
            <surname>Martinez</surname>
            <initial>O.</initial>
          </persName>
          <persName>
            <foreName>Xavier</foreName>
            <surname>Binefa</surname>
            <initial>X.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="no" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICLR workshop on Learning from Limited Labeled Data</title>
        <loc>New Orleans, United States</loc>
        <imprint>
          <dateStruct>
            <month>May</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-17</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-01896007" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01896007</ref>
        </imprint>
        <meeting id="cid624026">
          <title>International Conference on Learning Representations</title>
          <num>7</num>
          <abbr type="sigle">ICLR</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid17" type="inproceedings" rend="year" n="cite:ruiz:hal-02267564">
      <identifiant type="hal" value="hal-02267564"/>
      <analytic>
        <title level="a">Adaptative Inference Cost With Convolutional Neural Mixture Models</title>
        <author>
          <persName>
            <foreName>Adrià</foreName>
            <surname>Ruiz</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICCV 2019 - International Conference on Computer Vision</title>
        <loc>Seoul, South Korea</loc>
        <imprint>
          <dateStruct>
            <month>October</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-12</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02267564" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02267564</ref>
        </imprint>
        <meeting id="cid82250">
          <title>IEEE International Conference on Computer Vision</title>
          <num>2019</num>
          <abbr type="sigle">ICCV</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid13" type="inproceedings" rend="year" n="cite:sablayrolles:hal-02278902">
      <identifiant type="hal" value="hal-02278902"/>
      <analytic>
        <title level="a">White-box vs Black-box: Bayes Optimal Strategies for Membership Inference</title>
        <author>
          <persName key="thoth-2018-idp169600">
            <foreName>Alexandre</foreName>
            <surname>Sablayrolles</surname>
            <initial>A.</initial>
          </persName>
          <persName>
            <foreName>Matthijs</foreName>
            <surname>Douze</surname>
            <initial>M.</initial>
          </persName>
          <persName key="tau-2018-idp124336">
            <foreName>Yann</foreName>
            <surname>Ollivier</surname>
            <initial>Y.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName>
            <foreName>Hervé</foreName>
            <surname>Jégou</surname>
            <initial>H.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICML 2019 - 36th International Conference on Machine Learning</title>
        <loc>Long Beach, United States</loc>
        <imprint>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02278902" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02278902</ref>
        </imprint>
        <meeting id="cid32516">
          <title>International Conference on Machine Learning</title>
          <num>36</num>
          <abbr type="sigle">ICML</abbr>
        </meeting>
      </monogr>
      <note type="bnote">
        <ref xlink:href="https://arxiv.org/abs/1908.11229" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1908.<allowbreak/>11229</ref>
      </note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid3" type="inproceedings" rend="year" n="cite:sablayrolles:hal-02278905">
      <identifiant type="hal" value="hal-02278905"/>
      <analytic>
        <title level="a">Spreading vectors for similarity search</title>
        <author>
          <persName key="thoth-2018-idp169600">
            <foreName>Alexandre</foreName>
            <surname>Sablayrolles</surname>
            <initial>A.</initial>
          </persName>
          <persName>
            <foreName>Matthijs</foreName>
            <surname>Douze</surname>
            <initial>M.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName>
            <foreName>Hervé</foreName>
            <surname>Jégou</surname>
            <initial>H.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICLR 2019 - 7th International Conference on Learning Representations</title>
        <loc>New Orleans, United States</loc>
        <imprint>
          <dateStruct>
            <month>May</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-13</biblScope>
          <ref xlink:href="https://hal.inria.fr/hal-02278905" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02278905</ref>
        </imprint>
        <meeting id="cid624026">
          <title>International Conference on Learning Representations</title>
          <num>7</num>
          <abbr type="sigle">ICLR</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1806.03198" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1806.<allowbreak/>03198</ref> - Published at ICLR 2019</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid10" type="inproceedings" rend="year" n="cite:sydorov:hal-02292339">
      <identifiant type="hal" value="hal-02292339"/>
      <analytic>
        <title level="a">Focused Attention for Action Recognition</title>
        <author>
          <persName key="thoth-2018-idp174464">
            <foreName>Vladyslav</foreName>
            <surname>Sydorov</surname>
            <initial>V.</initial>
          </persName>
          <persName key="thoth-2018-idp115024">
            <foreName>Karteek</foreName>
            <surname>Alahari</surname>
            <initial>K.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">BMVC 2019 - British Machine Vision Conference</title>
        <loc>Cardiff, United Kingdom</loc>
        <imprint>
          <dateStruct>
            <month>September</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">1-12</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02292339" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02292339</ref>
        </imprint>
        <meeting id="cid38519">
          <title>British Machine Vision Conference</title>
          <num>30</num>
          <abbr type="sigle">BMVC</abbr>
        </meeting>
      </monogr>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid16" type="inproceedings" rend="year" n="cite:vladimirova:hal-02177151">
      <identifiant type="doi" value="10.05193"/>
      <identifiant type="hal" value="hal-02177151"/>
      <analytic>
        <title level="a">Understanding Priors in Bayesian Neural Networks at the Unit Level</title>
        <author>
          <persName key="mistis-2018-idp174224">
            <foreName>Mariia</foreName>
            <surname>Vladimirova</surname>
            <initial>M.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
          <persName>
            <foreName>Pablo</foreName>
            <surname>Mesejo</surname>
            <initial>P.</initial>
          </persName>
          <persName key="mistis-2018-idp121872">
            <foreName>Julyan</foreName>
            <surname>Arbel</surname>
            <initial>J.</initial>
          </persName>
        </author>
      </analytic>
      <monogr x-scientific-popularization="no" x-international-audience="yes" x-proceedings="yes" x-invited-conference="no" x-editorial-board="yes">
        <title level="m">ICML 2019 - 36th International Conference on Machine Learning</title>
        <loc>Long Beach, United States</loc>
        <title level="s">Proceedings of the 36th International Conference on Machine Learning</title>
        <imprint>
          <biblScope type="volume">97</biblScope>
          <dateStruct>
            <month>June</month>
            <year>2019</year>
          </dateStruct>
          <biblScope type="pages">6458-6467</biblScope>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02177151" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02177151</ref>
        </imprint>
        <meeting id="cid32516">
          <title>International Conference on Machine Learning</title>
          <num>36</num>
          <abbr type="sigle">ICML</abbr>
        </meeting>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1810.05193" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1810.<allowbreak/>05193</ref> - 10 pages, 5 figures, ICML'19 conference</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid27" type="unpublished" rend="year" n="cite:cheron:hal-01979583">
      <identifiant type="hal" value="hal-01979583"/>
      <monogr>
        <title level="m">Modeling Spatio-Temporal Human Track Structure for Action Localization</title>
        <author>
          <persName key="thoth-2018-idp140400">
            <foreName>Guilhem</foreName>
            <surname>Chéron</surname>
            <initial>G.</initial>
          </persName>
          <persName>
            <foreName>Anton</foreName>
            <surname>Osokin</surname>
            <initial>A.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>January</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-01979583" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01979583</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1806.11008" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1806.<allowbreak/>11008</ref> - working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid35" type="unpublished" rend="year" n="cite:iscen:hal-02370212">
      <identifiant type="doi" value="10.00324"/>
      <identifiant type="hal" value="hal-02370212"/>
      <monogr>
        <title level="m">Graph Convolutional Networks for Learning with Few Clean and many Noisy Labels</title>
        <author>
          <persName>
            <foreName>Ahmet</foreName>
            <surname>Iscen</surname>
            <initial>A.</initial>
          </persName>
          <persName>
            <foreName>Giorgos</foreName>
            <surname>Tolias</surname>
            <initial>G.</initial>
          </persName>
          <persName key="linkmedia-2018-idp156096">
            <foreName>Yannis</foreName>
            <surname>Avrithis</surname>
            <initial>Y.</initial>
          </persName>
          <persName>
            <foreName>Ondřej</foreName>
            <surname>Chum</surname>
            <initial>O.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>November</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02370212" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02370212</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1910.00324" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1910.<allowbreak/>00324</ref> - working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid11" type="unpublished" rend="year" n="cite:kulunchakov:hal-01993531">
      <identifiant type="hal" value="hal-01993531"/>
      <monogr>
        <title level="m">Estimate Sequences for Stochastic Composite Optimization: Variance Reduction, Acceleration, and Robustness to Noise</title>
        <author>
          <persName key="thoth-2018-idp157424">
            <foreName>Andrei</foreName>
            <surname>Kulunchakov</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>January</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-01993531" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-01993531</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1901.08788" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1901.<allowbreak/>08788</ref> - working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid36" type="unpublished" rend="year" n="cite:lecouat:hal-02414291">
      <identifiant type="hal" value="hal-02414291"/>
      <monogr>
        <title level="m">Revisiting Non Local Sparse Models for Image Restoration</title>
        <author>
          <persName key="thoth-2019-idp164256">
            <foreName>Bruno</foreName>
            <surname>Lecouat</surname>
            <initial>B.</initial>
          </persName>
          <persName key="willow-2018-idp112352">
            <foreName>Jean</foreName>
            <surname>Ponce</surname>
            <initial>J.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02414291" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02414291</ref>
        </imprint>
      </monogr>
      <note type="bnote">working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid8" type="unpublished" rend="year" n="cite:li:hal-02384675">
      <identifiant type="hal" value="hal-02384675"/>
      <monogr>
        <title level="m">Hierarchical Scene Coordinate Classification and Regression for Visual Localization</title>
        <author>
          <persName key="perception-2018-idp151616">
            <foreName>Xiaotian</foreName>
            <surname>Li</surname>
            <initial>X.</initial>
          </persName>
          <persName>
            <foreName>Shuzhe</foreName>
            <surname>Wang</surname>
            <initial>S.</initial>
          </persName>
          <persName>
            <foreName>Yi</foreName>
            <surname>Zhao</surname>
            <initial>Y.</initial>
          </persName>
          <persName key="thoth-2018-idp122832">
            <foreName>Jakob</foreName>
            <surname>Verbeek</surname>
            <initial>J.</initial>
          </persName>
          <persName>
            <foreName>Juho</foreName>
            <surname>Kannala</surname>
            <initial>J.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>November</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02384675" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02384675</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1909.06216" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1909.<allowbreak/>06216</ref> - working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid37" type="unpublished" rend="year" n="cite:mairal:hal-02417766">
      <identifiant type="hal" value="hal-02417766"/>
      <monogr>
        <title level="m">Cyanure: An Open-Source Toolbox for Empirical Risk Minimization for Python, C++, and soon more</title>
        <author>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02417766" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02417766</ref>
        </imprint>
      </monogr>
      <note type="bnote">working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid34" type="unpublished" rend="year" n="cite:mialon:hal-02395624">
      <identifiant type="hal" value="hal-02395624"/>
      <monogr>
        <title level="m">Screening Data Points in Empirical Risk Minimization via Ellipsoidal Regions and Safe Loss Functions</title>
        <author>
          <persName key="thoth-2018-idp164736">
            <foreName>Grégoire</foreName>
            <surname>Mialon</surname>
            <initial>G.</initial>
          </persName>
          <persName>
            <foreName>Alexandre</foreName>
            <surname>D'Aspremont</surname>
            <initial>A.</initial>
          </persName>
          <persName key="thoth-2018-idp112112">
            <foreName>Julien</foreName>
            <surname>Mairal</surname>
            <initial>J.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>December</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02395624" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02395624</ref>
        </imprint>
      </monogr>
      <note type="bnote">working paper or preprint</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid7" type="unpublished" rend="year" n="cite:strudel:hal-02274969">
      <identifiant type="hal" value="hal-02274969"/>
      <monogr>
        <title level="m">Learning to combine primitive skills: A step towards versatile robotic manipulation</title>
        <author>
          <persName key="willow-2018-idp154880">
            <foreName>Robin</foreName>
            <surname>Strudel</surname>
            <initial>R.</initial>
          </persName>
          <persName key="thoth-2018-idp167168">
            <foreName>Alexander</foreName>
            <surname>Pashevich</surname>
            <initial>A.</initial>
          </persName>
          <persName key="willow-2018-idp174496">
            <foreName>Igor</foreName>
            <surname>Kalevatykh</surname>
            <initial>I.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="willow-2018-idp117872">
            <foreName>Josef</foreName>
            <surname>Sivic</surname>
            <initial>J.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>August</month>
            <year>2019</year>
          </dateStruct>
          <ref xlink:href="https://hal.archives-ouvertes.fr/hal-02274969" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>archives-ouvertes.<allowbreak/>fr/<allowbreak/>hal-02274969</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1908.00722" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1908.<allowbreak/>00722</ref> - 11 pages</note>
    </biblStruct>
    
    <biblStruct id="thoth-2019-bid31" type="unpublished" rend="year" n="cite:varol:hal-02435731">
      <identifiant type="hal" value="hal-02435731"/>
      <monogr>
        <title level="m">Synthetic Humans for Action Recognition from Unseen Viewpoints</title>
        <author>
          <persName key="thoth-2018-idp179328">
            <foreName>Gül</foreName>
            <surname>Varol</surname>
            <initial>G.</initial>
          </persName>
          <persName key="willow-2018-idp114960">
            <foreName>Ivan</foreName>
            <surname>Laptev</surname>
            <initial>I.</initial>
          </persName>
          <persName key="thoth-2018-idp119968">
            <foreName>Cordelia</foreName>
            <surname>Schmid</surname>
            <initial>C.</initial>
          </persName>
          <persName>
            <foreName>Andrew</foreName>
            <surname>Zisserman</surname>
            <initial>A.</initial>
          </persName>
        </author>
        <imprint>
          <dateStruct>
            <month>January</month>
            <year>2020</year>
          </dateStruct>
          <ref xlink:href="https://hal.inria.fr/hal-02435731" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>hal.<allowbreak/>inria.<allowbreak/>fr/<allowbreak/>hal-02435731</ref>
        </imprint>
      </monogr>
      <note type="bnote"><ref xlink:href="https://arxiv.org/abs/1912.04070" location="extern" xlink:type="simple" xlink:show="replace" xlink:actuate="onRequest">https://<allowbreak/>arxiv.<allowbreak/>org/<allowbreak/>abs/<allowbreak/>1912.<allowbreak/>04070</ref> - working paper or preprint</note>
    </biblStruct>
  </biblio>
</raweb>
