{"@context":{"@vocab":"https://cir.nii.ac.jp/schema/1.0/","rdfs":"http://www.w3.org/2000/01/rdf-schema#","dc":"http://purl.org/dc/elements/1.1/","dcterms":"http://purl.org/dc/terms/","foaf":"http://xmlns.com/foaf/0.1/","prism":"http://prismstandard.org/namespaces/basic/2.0/","cinii":"http://ci.nii.ac.jp/ns/1.0/","datacite":"https://schema.datacite.org/meta/kernel-4/","ndl":"http://ndl.go.jp/dcndl/terms/","jpcoar":"https://github.com/JPCOAR/schema/blob/master/2.0/"},"@id":"https://cir.nii.ac.jp/crid/1360021391858844544.json","@type":"Article","productIdentifier":[{"identifier":{"@type":"DOI","@value":"10.1016/j.neucom.2023.126692"}},{"identifier":{"@type":"URI","@value":"https://api.elsevier.com/content/article/PII:S0925231223008159?httpAccept=text/xml"}},{"identifier":{"@type":"URI","@value":"https://api.elsevier.com/content/article/PII:S0925231223008159?httpAccept=text/plain"}},{"identifier":{"@type":"DOI","@value":"10.48550/arxiv.2201.06714"}}],"resourceType":"学術雑誌論文(journal article)","dc:title":[{"@value":"AdaTerm: Adaptive T-distribution estimated robust moments for Noise-Robust stochastic gradient optimization"}],"description":[{"notation":[{"@value":"With the increasing practicality of deep learning applications, practitioners are inevitably faced with datasets corrupted by noise from various sources such as measurement errors, mislabeling, and estimated surrogate inputs/outputs that can adversely impact the optimization results. It is a common practice to improve the optimization algorithm's robustness to noise, since this algorithm is ultimately in charge of updating the network parameters. Previous studies revealed that the first-order moment used in Adam-like stochastic gradient descent optimizers can be modified based on the Student's t-distribution. While this modification led to noise-resistant updates, the other associated statistics remained unchanged, resulting in inconsistencies in the assumed models. In this paper, we propose AdaTerm, a novel approach that incorporates the Student's t-distribution to derive not only the first-order moment but also all the associated statistics. This provides a unified treatment of the optimization process, offering a comprehensive framework under the statistical model of the t-distribution for the first time. The proposed approach offers several advantages over previously proposed approaches, including reduced hyperparameters and improved robustness and adaptability. This noise-adaptive behavior contributes to AdaTerm's exceptional learning performance, as demonstrated through various optimization problems with different and/or unknown noise ratios. Furthermore, we introduce a new technique for deriving a theoretical regret bound without relying on AMSGrad, providing a valuable contribution to the field"}]},{"notation":[{"@value":"27 pages; Final version accepted by Elsevier Neurocomputing Journal (2023-08; https://doi.org/10.1016/j.neucom.2023.126692)"}]}],"creator":[{"@id":"https://cir.nii.ac.jp/crid/1380021391858844567","@type":"Researcher","foaf:name":[{"@value":"Takamitsu Matsubara"}]},{"@id":"https://cir.nii.ac.jp/crid/1420001326228879872","@type":"Researcher","personIdentifier":[{"@type":"KAKEN_RESEARCHERS","@value":"10796452"},{"@type":"NRID","@value":"1000010796452"},{"@type":"NRID","@value":"9000299561684"},{"@type":"NRID","@value":"9000412372852"},{"@type":"NRID","@value":"9000331421946"},{"@type":"NRID","@value":"9000414369522"},{"@type":"NRID","@value":"9000405878919"},{"@type":"NRID","@value":"9000291596511"},{"@type":"NRID","@value":"9000308050815"},{"@type":"NRID","@value":"9000331421965"},{"@type":"NRID","@value":"9000248231751"},{"@type":"NRID","@value":"9000375892659"},{"@type":"NRID","@value":"9000405879592"},{"@type":"NRID","@value":"9000392525900"},{"@type":"NRID","@value":"9000309159272"},{"@type":"NRID","@value":"9000404786379"},{"@type":"NRID","@value":"9000399363389"},{"@type":"NRID","@value":"9000390922546"},{"@type":"NRID","@value":"9000411116288"},{"@type":"NRID","@value":"9000399364722"},{"@type":"RESEARCHMAP","@value":"https://researchmap.jp/kbys-t"}],"foaf:name":[{"@value":"Taisuke Kobayashi"}]},{"@id":"https://cir.nii.ac.jp/crid/1380021391858844557","@type":"Researcher","foaf:name":[{"@value":"Wendyam Eric Lionel Ilboudo"}]}],"publication":{"publicationIdentifier":[{"@type":"PISSN","@value":"09252312"}],"prism:publicationName":[{"@value":"Neurocomputing"}],"dc:publisher":[{"@value":"Elsevier BV"}],"prism:publicationDate":"2023-11","prism:volume":"557","prism:startingPage":"126692"},"reviewed":"false","dc:rights":["https://www.elsevier.com/tdm/userlicense/1.0/","https://www.elsevier.com/legal/tdmrep-license","http://www.elsevier.com/open-access/userlicense/1.0/","https://doi.org/10.15223/policy-017","https://doi.org/10.15223/policy-037","https://doi.org/10.15223/policy-012","https://doi.org/10.15223/policy-029","https://doi.org/10.15223/policy-004"],"url":[{"@id":"https://api.elsevier.com/content/article/PII:S0925231223008159?httpAccept=text/xml"},{"@id":"https://api.elsevier.com/content/article/PII:S0925231223008159?httpAccept=text/plain"}],"createdAt":"2023-08-22","modifiedAt":"2025-10-13","foaf:topic":[{"@id":"https://cir.nii.ac.jp/all?q=FOS:%20Computer%20and%20information%20sciences","dc:title":"FOS: Computer and information sciences"},{"@id":"https://cir.nii.ac.jp/all?q=Computer%20Science%20-%20Machine%20Learning","dc:title":"Computer Science - Machine Learning"},{"@id":"https://cir.nii.ac.jp/all?q=Machine%20Learning%20(cs.LG)","dc:title":"Machine Learning (cs.LG)"}],"project":[{"@id":"https://cir.nii.ac.jp/crid/1040285300696560512","@type":"Project","projectIdentifier":[{"@type":"KAKEN","@value":"20H04265"},{"@type":"JGN","@value":"JP20H04265"},{"@type":"URI","@value":"https://kaken.nii.ac.jp/grant/KAKENHI-PROJECT-20H04265/"}],"notation":[{"@language":"ja","@value":"ヒトとの物理的接触モデルを紐解く深層学習の開発と安全なロボット制御への応用"},{"@language":"en","@value":"Development of deep learning to reveal physical human-robot interaction and its application to safe robot control"}]}],"relatedProduct":[{"@id":"https://cir.nii.ac.jp/crid/1050282677439916928","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["references"],"jpcoar:relatedTitle":[{"@language":"en","@value":"Sigmoid-weighted linear units for neural network function approximation in reinforcement learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1050295834376528512","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["references"],"jpcoar:relatedTitle":[{"@language":"en","@value":"q-VAE for Disentangled Representation Learning and Latent Dynamical Systems"}]},{"@id":"https://cir.nii.ac.jp/crid/1360009142928678016","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"t-soft update of target network for deep reinforcement learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1360292620522120448","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Mirror descent and nonlinear projected subgradient methods for convex optimization"}]},{"@id":"https://cir.nii.ac.jp/crid/1360292621117052672","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Robust Stochastic Gradient Descent With Student-t Distribution Based First-Order Momentum"}]},{"@id":"https://cir.nii.ac.jp/crid/1360298762193267328","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A Survey of Optimization Methods From a Machine Learning Perspective"}]},{"@id":"https://cir.nii.ac.jp/crid/1360572092372443904","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Student-t policy in reinforcement learning to acquire global optimum of robot control"}]},{"@id":"https://cir.nii.ac.jp/crid/1360579820055099520","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Knowledge Distillation: A Survey"}]},{"@id":"https://cir.nii.ac.jp/crid/1360584343259320704","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"HyAdamC: A New Adam-Based Hybrid Optimization Algorithm for Convolution Neural Networks"}]},{"@id":"https://cir.nii.ac.jp/crid/1360584346297675776","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Efficient learning with robust gradient descent"}]},{"@id":"https://cir.nii.ac.jp/crid/1360853567636215168","@type":"Article","resourceType":"学術雑誌論文(journal article)","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Deep Learning-Based Average Consensus"}]},{"@id":"https://cir.nii.ac.jp/crid/1360855567865740544","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Deep Residual Learning for Image Recognition"}]},{"@id":"https://cir.nii.ac.jp/crid/1360865817582315392","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"On classifier behavior in the presence of mislabeling noise"}]},{"@id":"https://cir.nii.ac.jp/crid/1360865817582689536","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"On the approximation of functions by tanh neural networks"}]},{"@id":"https://cir.nii.ac.jp/crid/1360865821451723136","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Position-Transitional Particle Swarm Optimization-Incorporated Latent Factor Analysis"}]},{"@id":"https://cir.nii.ac.jp/crid/1360865821455638656","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Robust Estimation via Robust Gradient Estimation"}]},{"@id":"https://cir.nii.ac.jp/crid/1361418519657311872","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Rethinking the Inception Architecture for Computer Vision"}]},{"@id":"https://cir.nii.ac.jp/crid/1362262946383268736","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A Stochastic Approximation Method"}]},{"@id":"https://cir.nii.ac.jp/crid/1363951794115240960","@type":"Article","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Deep learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844416","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A tail-index analysis of stochastic gradient noise in deep neural networks"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844420","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Descending through a crowded valley-benchmarking deep learning optimizers"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844423","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Towards theoretically understanding why sgd generalizes better than adam in deep learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844544","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Understanding and improving layer normalization"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844545","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A framework for behavioural cloning"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844547","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A new regret analysis for adam-type algorithms"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844548","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Understanding and improving convolutional neural networks via concatenated rectified linear units"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844550","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"EasyLabel: A semi-automatic pixel-wise object annotation tool for creating robotic RGB-D datasets"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844553","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Does label smoothing mitigate label noise?"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844557","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Truncated backpropagation through time and Kalman filter training for neurocontrol"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844558","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Neural ordinary differential equations"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844560","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Adaptive t-momentum-based optimization for unknown ratio of outliers in amateur data in imitation learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844561","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"AdaBelief optimizer: Adapting stepsizes by the belief in observed gradients"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844562","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A novel parameter estimation algorithm for the multivariate t-distribution and its application to computer vision"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844563","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"A robust adaptive stochastic gradient method for deep learning"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844564","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Robustness analysis of non-convex stochastic gradient descent using biased expectations"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844569","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Proximal policy optimization with adaptive threshold for symmetric relative density ratio"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844570","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Automatic differentiation in PyTorch"}]},{"@id":"https://cir.nii.ac.jp/crid/1370021391858844572","@type":"Product","relationType":["references"],"jpcoar:relatedTitle":[{"@value":"Stochastic optimization with heavy-tailed noise via accelerated gradient clipping"}]}],"dataSourceIdentifier":[{"@type":"CROSSREF","@value":"10.1016/j.neucom.2023.126692"},{"@type":"KAKEN","@value":"PRODUCT-25234037"},{"@type":"OPENAIRE","@value":"doi_dedup___::7c4a58cdfd92c7a50649cd149f517003"}]}