@inproceedings{faucris.325534240,
  abstract      = {Dialogue Enhancement (DE) enables the rebalancing of dialogue and background sounds to fit personal preferences and needs in the context of broadcast audio. When individual audio stems are unavailable from production, Dialogue Separation (DS) can be applied to the final audio mixture to obtain estimates of these stems. This work focuses on Preferred Loudness Differences (PLDs) between dialogue and background sounds. While previous studies determined the PLD through a listening test employing original stems from production, stems estimated by DS are used in the present study. In addition, a larger variety of signal classes is considered. PLDs vary substantially across individuals (average interquartile range: 5.7 LU). Despite this variability, PLDs are found to be highly dependent on the signal type under consideration, and it is shown that median PLDs can be predicted using objective intelligibility metrics. Two existing baseline prediction methods - intended for use with original stems - displayed a Mean Absolute Error (MAE) of 7.5 LU and 5 LU, respectively. A modified baseline (MAE: 3.2 LU) and an alternative approach (MAE: 2.5 LU) are proposed. Results support the viability of processing final broadcast mixtures with DS and offering an alternative remixing that accounts for median PLDs.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.325533737,
  abstract      = {This paper proposes SEFGAN, a Deep Neural Network (DNN) combining maximum likelihood training and Generative Adversarial Networks (GANs) for efficient speech enhancement (SE). For this, a DNN is trained to synthesize the enhanced speech conditioned on noisy speech using a Normalizing Flow (NF) as generator in a GAN framework. While the combination of likelihood models and GANs is not trivial, SEFGAN demonstrates that a hybrid adversarial and maximum likelihood training approach enables the model to maintain high quality audio generation and log-likelihood estimation. Our experiments indicate that this approach strongly outperforms the baseline NF-based model without introducing additional complexity to the enhancement network. A comparison using computational metrics and a listening experiment reveals that SEFGAN is competitive with other state-of-the-art models.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.312698410,
  abstract       = {A common problem when coding transient signals in transform audio codecs is the temporal unmasking of quantization noise leading to artifacts such as pre-echoes. In this work, a new method for temporal shaping of quantization noise is introduced. Subband merging/splitting with overlapping windows is applied on the MDCT (Modified Discrete Cosine Transform) spectrum in the encoder and decoder. This allows direct modification of the temporal subband signals for different frequency ranges in a way that results in a flattening of the temporal envelopes in the encoder before quantization and a reshaping to their original envelope at the decoder. The reshaping process at the decoder not only restores the envelopes of the original subband signals, but also shapes the quantization noise accordingly, which results in quantization noise temporally shaped similarly to the original subband signal. This avoids temporal unmasking of quantization noise and the pre-echo artifact. For the temporal flattening/reshaping process the application of gains transmitted via Autoregressive model as well as a low side information approach via companding are proposed and evaluated regarding their noise shaping capabilities in an experimental codec system.},
  author         = {Füg, Richard and Edler, Bernd},
  booktitle      = {IEEE Workshop on Applications of Signal Processing to Audio and Acoustics},
  date           = {2023-10-22/2023-10-25},
  doi            = {10.1109/WASPAA58266.2023.10248079},
  faupublication = {yes},
  isbn           = {9798350323726},
  keywords       = {audio coding; mdct; subband merging/splitting; temporal noise shaping (tns)},
  note           = {CRIS-Team Scopus Importer:2023-10-13},
  peerreviewed   = {unknown},
  publisher      = {Institute of Electrical and Electronics Engineers Inc.},
  title          = {{Temporal} {Noise} {Shaping} on {MDCT} {Subband} {Signals} for {Transform} {Audio} {Coding}},
  venue          = {New Paltz, NY, USA},
  volume         = {2023-October},
  year           = {2023},
}
@inproceedings{faucris.289296932,
  abstract       = {Deep generative models for Speech Enhancement (SE) received increasing attention in recent years. The most prominent example are Generative Adversarial Networks (GANs), while normalizing flows (NF) received less attention despite their potential. Building on previous work, architectural modifications are proposed, along with an investigation of different conditional input representations. Despite being a common choice in related works, Mel-spectrograms demonstrate to be inadequate for the given scenario. Alternatively, a novel All-Pole Gammatone filterbank (APG) with high temporal resolution is proposed. Although computational evaluation metric results would suggest that state-of-the-art GAN-based methods perform best, a perceptual evaluation via a listening test indicates that the presented NF approach (based on time domain and APG) performs best, especially at lower SNRs. On average, APG outputs are rated as having good quality, which is unmatched by the other methods, including GAN.},
  author         = {Strauß, Martin and Torcoli, Matteo and Edler, Bernd},
  booktitle      = {2022 IEEE Spoken Language Technology Workshop, SLT 2022 - Proceedings},
  date           = {2023-01-09/2023-01-12},
  doi            = {10.1109/SLT54892.2023.10022898},
  faupublication = {yes},
  isbn           = {9798350396904},
  keywords       = {all-pole gammatone filterbank; DNN; normalizing flows; speech enhancement},
  note           = {CRIS-Team Scopus Importer:2023-02-17},
  pages          = {444--450},
  peerreviewed   = {unknown},
  publisher      = {Institute of Electrical and Electronics Engineers Inc.},
  title          = {{Improved} {Normalizing} {Flow}-{Based} {Speech} {Enhancement} {Using} an all-{Pole} {Gammatone} {Filterbank} for {Conditional} {Input} {Representation}},
  venue          = {Doha},
  year           = {2023},
}
@inproceedings{faucris.307587980,
  abstract       = {Since 2017, monthly 3D audio recordings of a nature preserve capture the acoustic environment over seasons and years. The recordings are made at the same location and using the same recording equipment, capturing one hour before and after sunset. The recordings, annotated with real-Time weather data and manually labeled for acoustic events, are made to understand if and how a natural soundscape evolves over time allowing for data-driven speculation about transformations of the soundscape that might be caused by climate change. After a short description of the general project and its current state, methods and results of algorithmic analysis used are presented and the results are discussed. Further methods of collecting additional data and expanded analyses of the body of data are suggested.},
  author         = {Poepel, Cornelius and Finger, Katja and Peters, Nils and Edler, Bernd},
  booktitle      = {ACM International Conference Proceeding Series},
  date           = {2022-09-06/2022-09-09},
  doi            = {10.1145/3561212.3561217},
  faupublication = {yes},
  isbn           = {9781450397018},
  keywords       = {bioacoustic; ecoacoustics; signal analysis; soundscape},
  note           = {Created from Fastlane, Scopus look-up},
  pages          = {84--87},
  peerreviewed   = {unknown},
  publisher      = {Association for Computing Machinery},
  title          = {{Exploring} a {Long}-{Term} {Dataset} of {Nature} {Reserve} {Ambisonics} {Recordings}},
  venue          = {St. Pölten},
  year           = {2022},
}
@inproceedings{faucris.287477126,
  address        = {NEW YORK},
  author         = {Gupta, Kishan and Korse, Srikanth and Edler, Bernd and Fuchs, Guillaume},
  booktitle      = {2022 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH AND SIGNAL PROCESSING (ICASSP)},
  doi            = {10.1109/ICASSP43922.2022.9747410},
  faupublication = {yes},
  month          = jan,
  note           = {CRIS-Team WoS Importer:2023-01-13},
  pages          = {836--840},
  peerreviewed   = {unknown},
  publisher      = {IEEE},
  title          = {{A} {DNN} {Based} {Post}-{Filter} to {Enhance} the {Quality} of {Coded} {Speech} in {MDCT} {Domain}},
  venue          = {Singapore},
  year           = {2022},
}
@inproceedings{faucris.282434270,
  address        = {BAIXAS},
  author         = {Strauß, Martin and Paulus, Jouni and Torcoli, Matteo and Edler, Bernd},
  booktitle      = {INTERSPEECH 2021},
  doi            = {10.21437/Interspeech.2021-1418},
  faupublication = {yes},
  month          = jan,
  note           = {CRIS-Team WoS Importer:2022-09-30},
  pages          = {3900--3904},
  peerreviewed   = {unknown},
  publisher      = {ISCA-INT SPEECH COMMUNICATION ASSOC},
  title          = {{A} {Hands}-on {Comparison} of {DNNs} for {Dialog} {Separation} {Using} {Transfer} {Learning} from {Music} {Source} {Separation}},
  venue          = {Brno},
  year           = {2021},
}
@inproceedings{faucris.266499116,
  address        = {NEW YORK},
  author         = {Strauß, Martin and Edler, Bernd},
  booktitle      = {2021 IEEE INTERNATIONAL CONFERENCE ON ACOUSTICS, SPEECH AND SIGNAL PROCESSING (ICASSP 2021)},
  doi            = {10.1109/ICASSP39728.2021.9413999},
  faupublication = {yes},
  month          = jan,
  note           = {CRIS-Team WoS Importer:2021-11-26},
  pages          = {5754--5758},
  peerreviewed   = {unknown},
  publisher      = {IEEE},
  title          = {{A} {FLOW}-{BASED} {NEURAL} {NETWORK} {FOR} {TIME} {DOMAIN} {SPEECH} {ENHANCEMENT}},
  venue          = {Toronto, ON},
  year           = {2021},
}
@inproceedings{faucris.259927668,
  abstract       = {The possible fields of application for small sensor nodes are tremendous and still growing fast. Concepts like the Internet of Things (IoT), Smart City or Industry 4.0 adopt wireless sensor networks for environmental interaction or metering purposes. As they commonly operate in license-exempt frequency bands, telemetry transmissions of sensors are subject to strong interferences and possible shadowing. Especially in the scope of Low Power Wide Area (LPWA) communications, this scenario results in high computational effort and complexity for the receiver side to perceive the signals of interest. Therefore, this paper investigates means to an adequate segmentation of receive spectra for a partial spectrum exchange between base stations of telemetry-based IoT sensor networks. The distinct interchange of in-phase and quadrature (IQ) data could facilitate stream combining techniques to mask out interferences amongst other approaches. This shall improve decoding rates even under severe operation conditions and simultaneously limit the required data volume. We refer to this approach of a reception network as Edge-RAN (Random Access Network). To cope with the high data rates and still enable a base station collaboration, especially in wirelessly connected receiver mesh networks, different filter bank techniques and block transforms are examined, to divide telemetry spectra into distinct frequency sub-channels. Operational constraints for the spectral decomposition are given and different filter methodologies are introduced. Finally, suitable metrics are established. These metrics shall assess the performance of the presented spectrum segmentation schemes for the purpose of a selective partial interchange between sensor network receivers.},
  author         = {Schadhauser, Michael and Robert, Jörg and Heuberger, Albert and Edler, Bernd},
  booktitle      = {2021 IEEE International IOT, Electronics and Mechatronics Conference, IEMTRONICS 2021 - Proceedings},
  date           = {2021-04-21/2021-04-24},
  doi            = {10.1109/IEMTRONICS52119.2021.9422584},
  editor         = {Chakrabarti, Satyajit and Paul, Rajashree and Gill, Bob and Gangopadhyay, Malay and Poddar, Sanghamitra},
  faupublication = {yes},
  isbn           = {9781665440677},
  keywords       = {Edge-RAN decoding; IoT; Long range telemetry; Low Power Wide Area (LPWA) communications; Spectrum segmentation; Wireless sensor network},
  note           = {CRIS-Team Scopus Importer:2021-06-11},
  peerreviewed   = {unknown},
  publisher      = {Institute of Electrical and Electronics Engineers Inc.},
  title          = {{Spectrum} segmentation techniques for edge-{RAN} decoding in telemetry-based {IoT} networks},
  venue          = {Toronto, ON},
  year           = {2021},
}
@article{faucris.258619963,
  abstract      = {In this paper we propose a long-term prediction method for low delay transform domain general audio coders. This Frequency Domain Joint Harmonics Prediction (FDJHP) method operates directly in the Modified Discrete Cosine Transform (MDCT) domain and can enhance the coding efficiency, even under very low frequency resolutions. We compare this new method with state-of-the-art MDCT based methods by analyzing bitrate savings and by a listening test using test signals with strong harmonic components. The results indicate that it outperforms an existing method, which also directly operates in the frequency domain. Additionally, we show how it can be combined with the existing techniques into an adaptive system, where the different methods can complement each other.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, journal, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.248081847,
  abstract       = {A blind bandwidth extension is presented which improves the perceived quality of 4 kHz speech by artificially extending the speech's frequency range to 8 kHz. Based on the source-filter model of the human speech production, the speech signal is decomposed into spectral envelope and excitation signal and each of them is extrapolated separately. With this decomposition, good perceptual quality can be achieved while keeping the computational complexity low. The focus of this work is in the generation of an excitation signal with an autoregressive model that calculates a distribution for each audio sample conditioned on previous samples. This is achieved with a deep neural network following the architecture of LPCNet [1]. A listening test shows that it significantly improves the perceived quality of bandlimited speech. The system has an algorithmic delay of 30 ms and can be applied in state-of-the-art speech and audio codecs.},
  author         = {Schmidt, Konstantin and Edler, Bernd},
  booktitle      = {European Signal Processing Conference},
  date           = {2020-08-24/2020-08-28},
  doi            = {10.23919/Eusipco47968.2020.9287465},
  faupublication = {yes},
  isbn           = {9789082797053},
  keywords       = {Artificial bandwidth expansion; Audio super resolution; Bandwidth extension; Speech enhancement; Speech super resolution},
  month          = jan,
  note           = {CRIS-Team Scopus Importer:2021-01-22},
  pages          = {426--430},
  peerreviewed   = {unknown},
  publisher      = {European Signal Processing Conference, EUSIPCO},
  title          = {{Blind} bandwidth extension of speech based on {LPCNet}},
  venue          = {Amsterdam},
  volume         = {2021-January},
  year           = {2021},
}
@misc{faucris.251691071,
  author         = {Werner, Nils and Edler, Bernd},
  faupublication = {yes},
  keywords       = {\#nosource,mypublication},
  peerreviewed   = {automatic},
  title          = {{Time}-{Varying} {Time}-{Frequency} {Tilings} {Using} {Non}-{Uniform} {Orthogonal} {Filterbanks} {Based} on {MDCT} {Analysis}/{Synthesis} and {TDAR}},
  year           = {2020},
}
@misc{faucris.251690578,
  author         = {Werner, Nils and Edler, Bernd and Disch, Sascha},
  faupublication = {yes},
  keywords       = {audio signal,bins,mypublication,samples,set,subband},
  peerreviewed   = {automatic},
  title          = {{Perceptual} audio coding with adaptive non-uniform time/frequency tiling using subband merging and the time domain aliasing reduction},
  year           = {2020},
}
@inproceedings{faucris.230327088,
  abstract       = {Up to today telephone speech is still limited to the range of 200 to 3400 Hz since the predominant codecs in public switched telephone networks are AMR-NB, G.711 and G.722 [1, 2, 3]. Blind bandwidth extension (blind BWE, BBWE) can improve the perceived quality as well as the intelligibility of coded speech without changing the transmission network or the speech codec. The BBWE used in this work is based on deep neural networks (DNNs) and has already shown good performance [4]. Although this BBWE enhances the speech without producing too many artifacts it sometimes fails to enhance prominent fricatives which can result in muffled speech. In order to better synthesize prominent fricatives the BBWE is extended by sending a single bit of side information - here referred to as guided BWE. This bit may be transmitted e.g. by watermarking so that no changes to the transmission network or the speech codec have to be done. Different DNN configurations (including convolutional (Conv.) layers as well as long short-term memory layers (LSTM)) making use of this bit have been evaluated. The BBWE has a low computational complexity and an algorithmic delay of 12 ms only and can be applied in state-of-the-art speech and audio codecs.},
  author         = {Schmidt, Konstantin and Edler, Bernd},
  booktitle      = {147th Audio Engineering Society International Convention 2019},
  date           = {2019-10-16/2019-10-19},
  faupublication = {yes},
  note           = {CRIS-Team Scopus Importer:2019-12-10},
  peerreviewed   = {unknown},
  publisher      = {Audio Engineering Society},
  title          = {{Deep} neural network based guided speech bandwidth extension},
  venue          = {New York, NY},
  year           = {2019},
}
@article{faucris.229741815,
  abstract       = {Time Domain Aliasing Reduction (TDAR) is a method to improve the impulse response compactness of non-uniform orthogonal Modified Discrete Cosine Transforms (MDCT). Previously, TDAR was only possible between frames of identical time-frequency tilings, however in this letter we describe a method to overcome this limitation. This method enables the use of TDAR between two consecutive frames of different time-frequency tilings by introducing another subband merging or subband splitting step. Consecutively, this method allows more flexible and adaptive filterbank tilings while retaining compact impulse responses, two attributes needed for efficient perceptual audio coding.},
  author         = {Werner, Nils and Edler, Bernd},
  doi            = {10.1109/LSP.2019.2949433},
  faupublication = {yes},
  journal        = {IEEE Signal Processing Letters},
  keywords       = {MDCT; perceptual coding; TDAC; TDAR; time-frequency transform},
  note           = {CRIS-Team Scopus Importer:2019-11-26},
  pages          = {1783--1787},
  peerreviewed   = {Yes},
  title          = {{Time}-{Varying} {Time}-{Frequency} {Tilings} {Using} {Non}-{Uniform} {Orthogonal} {Filterbanks} {Based} on {MDCT} {Analysis}/{Synthesis} and {Time} {Domain} {Aliasing} {Reduction}},
  volume         = {26},
  year           = {2019},
}
@inproceedings{faucris.227084764,
  abstract      = {Masking models that evaluate the audibility of error signals have a limited validity for assessing perceptual quality of parametric codecs. We propose a model that transforms the audio signal into an Internal Representation (IR) consisting of temporal-envelope modulation patterns. Subsequently, the IR of original and encoded signals are compared between both signals. Even though the audio signals compared may be uncorrelated, leading to a large error signal, they may exhibit a very similar IR and hence are predicted to sound very similar. Additional post-processing stages modeling higher-level auditory perceptual phenomena such as Comodulation Masking Release are included. Predictions are compared against subjective quality assessment results obtained with encoding methods ranging from parametric processing methods up to classic waveform preserving codecs.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.227084517,
  address        = {Dublin, Ireland},
  author         = {Werner, Nils and Edler, Bernd},
  booktitle      = {146th AES Convention},
  date           = {2019-03-20/2019-03-23},
  faupublication = {yes},
  peerreviewed   = {unknown},
  title          = {{Computational} {Complexity} of a {Cascaded} {Nonuniform} {Orthogonal} {Lapped} {Filterbank} {Based} on {MDCT} and {Time} {Domain} {Aliasing} {Reduction}},
  venue          = {Dublin},
  year           = {2019},
}
@inproceedings{faucris.227084270,
  address        = {Dublin, Ireland},
  author         = {Werner, Nils and Edler, Bernd},
  booktitle      = {146th AES Convention},
  date           = {2019-03-20/2019-03-23},
  faupublication = {yes},
  peerreviewed   = {unknown},
  title          = {{Experimenting} with {Lapped} {Transforms} in {Numerical} {Computation} {Libraries} using {Polyphase} {Matrices} and {Strided} {Memory} {Views}},
  venue          = {Dublin},
  year           = {2019},
}
@inproceedings{faucris.227084022,
  author         = {Werner, Nils and Edler, Bernd},
  booktitle      = {2019 IEEE International Conference on Acoustics, Speech and Signal Processing},
  date           = {2019-05-12/2019-05-17},
  doi            = {10.1109/ICASSP.2019.8683502},
  faupublication = {yes},
  peerreviewed   = {unknown},
  title          = {{Perceptual} {Audio} {Coding} with {Adaptive} {Non}-{Uniform} {Time}/{Frequency} {Tilings} using {Subband} {Merging} and {Time} {Domain} {Aliasing} {Reduction}},
  venue          = {Brighton},
  year           = {2019},
}
@article{faucris.212123140,
  author         = {Stöter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuël},
  doi            = {10.1109/TASLP.2018.2877892},
  faupublication = {yes},
  journal        = {IEEE/ACM Transactions on Audio, Speech and Language Processing},
  pages          = {268--282},
  peerreviewed   = {Yes},
  title          = {{CountNet}: {Estimating} the number of concurrent speakers using supervised learning},
  volume         = {27},
  year           = {2019},
}
@misc{faucris.251690825,
  author         = {Werner, Nils and Edler, Bernd},
  faupublication = {yes},
  keywords       = {audio signal,block,mypublication,samples,set,subband},
  peerreviewed   = {automatic},
  title          = {{Time} domain aliasing reduction for non-uniform filterbanks which use spectral analysis followed by partial synthesis},
  year           = {2018},
}
@article{faucris.227085502,
  abstract       = {For a long time, many popular listening test methods, such as ITU-R BS.1534 (MUSHRA), could not be carried out as web-based listening tests, since established web standards did not support all required audio processing features. With the standardization of the Web Audio API, the required features became available and, therefore, also the possibility to implement a wide range of established methods as web-based listening tests. In order to simplify the implementation of MUSHRA listening tests, the development of webMUSHRA was started. By utilizing webMUSHRA, experimenters can configure web-based MUSHRA listening tests without the need of web programming expertise. Today, webMUSHRA supports many more listening test methods, such as ITU-R BS.1116 and forced-choice procedures. Moreover, webMUSHRA is highly customizable and has been used in many auditory studies for different purposes},
  author         = {Schöffler, Michael and Bartoschek, Sarah and Stöter, Fabian-Robert and Röß, Marlene and Westphal, Susanne and Edler, Bernd and Herre, Jürgen},
  doi            = {10.5334/jors.187},
  faupublication = {yes},
  journal        = {Journal of Open Research Software},
  keywords       = {web-based, listening tests, auditory experiments, MUSHRA, Web Audio API},
  note           = {herre{\_}books{\_}journals},
  number         = {1},
  pages          = {8},
  peerreviewed   = {Yes},
  title          = {{webMUSHRA} - {A} {Comprehensive} {Framework} for {Web}-based {Listening} {Tests}},
  url            = {https://openresearchsoftware.metajnl.com/articles/10.5334/jors.187/},
  volume         = {6},
  year           = {2018},
}
@inproceedings{faucris.227085256,
  author         = {Stöter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuël},
  doi            = {10.1109/icassp.2018.8462159},
  faupublication = {yes},
  internal-note  = {possible duplicate of faucris.212120706 (same DOI); merge or remove one of the two},
  peerreviewed   = {unknown},
  title          = {{Classification} vs. regression in supervised learning for speaker count estimation},
  venue          = {Calgary, Alberta},
  year           = {2018},
}
@inproceedings{faucris.227085008,
  abstract      = {Since early perceptual audio coders such as mp3, the underlying psychoacoustic model that controls the encoding process has not undergone many dramatic changes. Meanwhile, modern audio coders have been equipped with semi-parametric or parametric coding tools such as audio bandwidth extension. Thereby, the initial psychoacoustic model used in a perceptual coder, just considering added quantization noise, became partly unsuitable. We propose the use of an improved psychoacoustic excitation model based on an existing model proposed by Dau et al. in 1997. This modulation-based model is essentially independent from the input waveform by calculating an internal auditory representation. Using the example of MPEG-H 3D Audio and its semi-parametric Intelligent Gap Filling (IGF) tool, we demonstrate that we can successfully control the IGF parameter selection process to achieve overall improved perceptual quality.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.212120706,
  address        = {Calgary, Canada},
  author         = {Stöter, Fabian-Robert and Chakrabarty, Soumitro and Edler, Bernd and Habets, Emanuël},
  booktitle      = {Proc. IEEE Intl. Conf. on Acoustics, Speech and Signal Processing (ICASSP)},
  doi            = {10.1109/ICASSP.2018.8462159},
  faupublication = {yes},
  internal-note  = {possible duplicate of faucris.227085256 (same DOI); merge or remove one of the two},
  pages          = {436--440},
  peerreviewed   = {unknown},
  title          = {{Classification} vs. regression in supervised learning for single channel speaker count estimation},
  year           = {2018},
}
@inproceedings{faucris.212118008,
  author         = {Mack, Wolfgang and Chakrabarty, Soumitro and Stöter, Fabian-Robert and Braun, Sebastian and Edler, Bernd and Habets, Emanuël},
  booktitle      = {Proc. Interspeech Conf.},
  doi            = {10.21437/Interspeech.2018-1296},
  faupublication = {yes},
  pages          = {1314--1318},
  peerreviewed   = {unknown},
  title          = {{Single}-channel dereverberation using direct {MMSE} optimization and bidirectional {LSTM} networks},
  year           = {2018},
}
@article{faucris.227085751,
  author         = {Werner, Nils and Edler, Bernd},
  doi            = {10.1109/LSP.2017.2678023},
  faupublication = {yes},
  journal        = {IEEE Signal Processing Letters},
  keywords       = {Encoding;Indexes;Merging;Signal resolution;Time-domain analysis;Time-frequency analysis;Transforms;Modified discrete cosine transform (MDCT);perceptual coding;time-domain aliasing reduction (TDAR);time-frequency transform},
  pages          = {589--593},
  peerreviewed   = {Yes},
  title          = {{Nonuniform} {Orthogonal} {Filterbanks} {Based} on {MDCT} {Analysis}/{Synthesis} and {Time}-{Domain} {Aliasing} {Reduction}},
  volume         = {24},
  year           = {2017},
}
@inproceedings{faucris.224244342,
  address        = {London, UK},
  author         = {Werner, Nils and Balke, Stefan and Stöter, Fabian-Robert and Müller, Meinard and Edler, Bernd},
  booktitle      = {Proceedings of the Web Audio Conference (WAC)},
  faupublication = {yes},
  peerreviewed   = {unknown},
  title          = {trackswitch.js: {A} {Versatile} {Web}-{Based} {Audio} {Player} for {Presenting} {Scientific} {Results}},
  year           = {2017},
}
@inproceedings{faucris.227087307,
  abstract      = {Contemporary perceptual audio coders, all of which apply the modified discrete cosine transform (MDCT), with an overlap ratio of 50%, for frequency-domain quantization, provide good coding quality even at low bit-rates. However, relatively long frames are required for acceptable low-rate performance also for quasi-stationary harmonic input, leading to increased algorithmic latency and reduced temporal coding resolution. This paper investigates the alternative approach of employing the extended lapped transform (ELT), with 75% overlap ratio, on such input. To maintain a high time resolution for coding of transient segments, the ELT definition is modified such that frame-wise switching between ELT (for quasi-stationary) and MDCT coding (for non-stationary or non-tonal regions), with complete time-domain aliasing cancelation and no increase in frame length, becomes possible. A new ELT window function with improved side-lobe rejection to avoid framing artifacts is also derived. Blind subjective evaluation of the switched-ratio proposal confirms the benefit of the signal-adaptive design.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@article{faucris.227087058,
  abstract      = {Perceptual audio coding schemes typically apply the modified discrete cosine transform (MDCT) with different lengths and windows, and utilize signal-adaptive switching between these on a perframe basis for best subjective performance. In previous papers, the authors demonstrated that further quality gains can be achieved for some input signals using additional transform kernels such as the modified discrete sine transform (MDST) or greater inter-transform overlap by means of a modified extended lapped transform (MELT). This work discusses the algorithmic procedures and codec modifications necessary to combine all of the above features-transform length, window shape, transform kernel, and overlap ratio switching-into a flexible input-adaptive coding system. It is shown that, due to full time-domain aliasing cancelation, this system supports perfect signal reconstruction in the absence of quantization and, thanks to fast realizations of all transforms, increases the codec complexity only negligibly. The results of a 5.1 multichannel listening test are also reported.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, journal, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.227086812,
  abstract      = {Intelligent Gap Filling (IGF) denotes a semi-parametric coding technique within modern codecs like MPEG-H-3D-Audio or the 3gpp-EVS-codec. IGF can be applied to fill spectral holes introduced by the quantization process in the encoder due to low-bitrate constraints. Typically, if the limited bit budget does not allow for transparent coding, spectral holes emerge in the high-frequency (HF) region of the signal first and increasingly affect the entire upper spectral range for lowest bitrates. At the decoder side, such spectral holes are substituted via IGF using synthetic HF content generated out of low-frequency (LF) content, and post-processing controlled by additional parametric side information. This paper provides an overview of the principles and functionalities of IGF and presents listening test data assessing the perceptual quality of IGF coded audio material.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.227086562,
  abstract      = {In perceptual audio coders, the audio signal masks the quantization noise. The masking effectiveness depends on the degree of tonality/noisiness of the signal. Hence, in psychoacoustic models (PM) of perceptual coders, the level of the estimated masking thresholds can be adjusted by tonality estimation methods. This paper introduces three envelope analysis methods for tonality estimation: optimized amplitude modulation ratio (AM-R), auditory image correlation, and temporal envelope rate. The methods were implemented in a filter bank-based PM. In a subjective quality test, they were compared to each other and to another existing method, partial spectral flatness measure (PSFM). The PSFM and the AM-R were rated significantly higher than the other methods.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@article{faucris.227086261,
  abstract      = {This paper examines how masked thresholds depend on the masker bandwidth and center frequency when the masker has a smaller bandwidth than the signal. The signal bandwidth was equal to the equivalent rectangular bandwidth of the auditory filter and the masker bandwidth was 0.1, 0.35, or 0.6 times the signal bandwidth. The masker and signal were centered at the same frequency of 257, 697, 1538, 3142, or 6930 Hz. Masked thresholds were estimated using a two-interval two-alternative forced-choice paradigm and a three-down one-up adaptive staircase method. Masked thresholds increased with increasing masker bandwidth and were lowest for medium center frequencies.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, journal, year, ...) are missing; restore from the original record},
}
@article{faucris.227086016,
  abstract      = {Perceptual audio coders exploit the masking properties of the human auditory system to reduce the bit rate in audio recording and transmission systems; it is intended that the quantization noise is just masked by the audio signal. The effectiveness of the audio signal as a masker depends on whether it is tone-like or noise-like. The determination of this, both physically and perceptually, depends on the duration of the stimuli. To gather information that might improve the efficiency of perceptual coders, the duration required to distinguish between a narrowband noise and a tone was measured as a function of center frequency and noise bandwidth. In experiment 1, duration thresholds were measured for isolated noise and tone bursts. In experiment 2, duration thresholds were measured for tone and noise segments embedded within longer tone pulses.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, journal, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.302104020,
  abstract      = {Listening tests are widely used to assess the quality of audio systems. The majority of such listening tests is conducted in controlled environments with selected participants and professional audio equipment. In the last few years, conducting listening tests over the Internet, as so called web-based experiments, has become popular. A recent study has shown that web-based experiments lead to comparable results as laboratory experiments. Until now, it was only possible to implement a limited number of listening test types as web-based experiments because web standards were missing some crucial features, e. g. sample manipulation of audio streams. With the upcoming of the Web Audio API, a much wider range of listening test types can be implemented as new audio processing features have been introduced. This paper demonstrates which new possibilities are enabled by the Web Audio API. To this end, the ITU-R Recommendation BS.1534 (MUSHRA) is taken as an example.},
  internal-note = {entry truncated in source: all fields after the abstract (author, title, booktitle, year, ...) are missing; restore from the original record},
}
@inproceedings{faucris.227088299, abstract = {Algorithms for estimating the fundamental frequency (F0) of a signal vary in stability and accuracy. We propose a method which iteratively improves the estimates of such algorithms by applying in each step a time warp on the input signal based on the previously estimated fundamental frequency. This time warp is designed to lead to a nearly constant F0. A refinement is then calculated through inverse time warping of the result of an F0 estimation applied to the warped signal. The proposed refinement algorithm is not limited to specific estimators or optimized for specific input signal characteristics. The method is evaluated on synthetic audio signals as well as speech recordings and polyphonic music recordings. Results indicate a significant improvement on accuracy when using the proposed refinement in combination with several well-known F0 estimators.
@inproceedings{faucris.227088050, abstract = {Traditional audio codecs based on real-valued transforms utilize separate and largely independent algorithmic schemes for parametric coding of noise-like or high-frequency spectral components as well as channel pairs. It is shown that in the frequency-domain part of coders such as Extended HE-AAC, these schemes can be unified into a single algorithmic block located at the core of the modified discrete cosine transform path, enabling greater flexibility like semi-parametric coding and large savings in codec delay and complexity. This paper focuses on the stereo coding aspect of this block and demonstrates that, by using specially chosen spectral configurations when deriving the parametric side-information in the encoder, perceptual artifacts can be reduced and the spatial processing in the decoder can remain real-valued. Listening tests confirm the benefit of our proposal at intermediate bit-rates.
@inproceedings{faucris.227087804, abstract = {Modern stereo and multi-channel perceptual audio codecs utilizing the modified discrete cosine transform (MDCT) can achieve very good overall coding quality even at low bit-rates but lack efficiency on some material with inter-channel phase difference (IPD) of about ±90 degrees. To address this issue a generalization of the lapped transform coding scheme is proposed which retains the perfect reconstruction property while allowing the usage of three further transform kernels, one of which is the modified discrete sine transform (MDST). Blind listening tests indicate that by frame-wise adaptation of each channel's transform kernel to the instantaneous IPD characteristics, notable gains in coding quality are possible with only negligible increase in decoder complexity and parameter rate.
@inproceedings{faucris.227087555, abstract = {Estimating the fundamental frequency (F0) of a signal is a well studied task in audio signal processing with many applications. If the F0 varies over time, the complexity increases, and it is also more difficult to provide ground truth data for evaluation. In this paper we present a novel dataset of cello recordings addressing the lack of reference annotations for musical instruments. Besides audio data, we include sensor recordings capturing the finger position on the fingerboard which is converted into an instantaneous frequency estimate. In speech processing, the electroglottograph (EGG) is able to capture the excitation signal of the vocal tract, which is then used to generate a reference instantaneous F0. Inspired by this approach, we included high speed video camera recordings to extract the excitation signal originating from the moving string. The derived data can be used to analyze vibratos --- a very commonly used playing style. The dataset is released under a Creative Commons license.
@inproceedings{faucris.306042164, author = {Herre, Jürgen and Edler, Bernd and Disch, Sascha}, booktitle = {17th International Conference on Digital Audio Effects DAFx-14}, faupublication = {yes}, note = {Invited Conference Workshop Presentations, Tutorials}, peerreviewed = {unknown}, title = {{Tutorial} on {Perceptual} {Audio} {Coding}}, venue = {Erlangen}, year = {2014} }
@inproceedings{faucris.227135516, abstract = {General-purpose MDCT-based audio coders like MP3 or HE-AAC utilize long inter-transform overlap and lookahead-based transform length switching to provide good coding quality for both stationary and non-stationary, i. e. transient, input signals even at low bitrates. In low-delay communication scenarios such as Voice over IP, however, algorithmic delay due to framing and overlap typically needs to be reduced and additional lookahead must be avoided. We show that these restrictions limit the performance of contemporary low-delay transform coders on either stationary or transient material and propose 3 modifications: an improved noise substitution technique and increased overlap between “long” transforms for stationary, and “long to short” transform length switching without lookahead and directly from the long overlap for transient frames. A listening test indicates the merit of these changes when integrated into AAC-LD.
@inproceedings{faucris.227135274, abstract = {In this work we present a new scenario of analyzing and separating linear mixtures of musical instrument signals. When instruments are playing in unison, traditional source separation methods are not performing well. Although the sources share the same pitch, they often still differ in their modulation frequency caused by vibrato and/or tremolo effects. In this paper we propose source separation schemes that exploit AM/FM characteristics to improve the separation quality of such mixtures. We show a method to process mixtures based on differences in their amplitude modulation frequency of the sources by using non-negative tensor factorization. Further, we propose an informed warped time domain approach for separating mixtures based on variations in the instantaneous frequencies of the sources.}, author = {Stöter, Fabian-Robert and Bayer, Stefan and Edler, Bernd}, booktitle = {17th International Conference on Digital Audio Effects (DAFx-14)}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Unison} {Source} {Separation}}, url = {http://dafx14.fau.de/papers/dafx14{\_}fabian-robert{\_}stoter{\_}unison{\_}source{\_}separation.pdf}, year = {2014} }
@inproceedings{faucris.227135031, abstract = {Psychoacoustic studies show that the strength of masking is, among others, dependent on the tonality of the masker: the effect of noise maskers is stronger than that of tone maskers. Recently, a Partial Spectral Flatness Measure (PSFM) was introduced for tonality estimation in a psychoacoustic model for perceptual audio coding. The model consists of an Infinite Impulse Response (IIR) filterbank which considers the spreading effect of individual local maskers in simultaneous masking. An optimized (with respect to audio quality and computational efficiency) PSFM is now compared to a similar psychoacoustic model with prediction based tonality estimation in medium (48 kbit/s) and low (32 kbit/s) bit rate conditions (mono) via subjective quality tests. 15 expert listeners participated in the subjective tests. The results are depicted and discussed. Additionally, we conducted the subjective tests with 15 non-expert consumers whose results are also shown and compared to those of the experts.
@inproceedings{faucris.227134789, author = {Taghipour, Armin and Guo, Shujie and Edler, Bernd}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Thresholds} for perception of rapid changes in tone bursts}, year = {2014} }
@inproceedings{faucris.227134545, abstract = {Perceptual audio codecs apply psychoacoustic principles such as masking effects of the human auditory system in order to reduce irrelevancies in the input audio signal. Psychoacoustic studies show differences between masking strength of tonal and noise maskers: the masking effect of narrowband noise is stronger than that of a tone which has the same power and is placed in the center frequency of the noise. In this paper, two tonality estimation methods are discussed which are implemented in a filter bank based psychoacoustic model. The first method is called Partial Spectral Flatness Measure (PSFM) and the second is referred to as Amplitude Modulation Ratio (AM-R). The psychoacoustic model uses a set of complex band-pass filters. It was designed according to the temporal/spectral resolution of the human auditory system, and takes into account post masking as well as the spreading effect of individual local maskers in simultaneous masking. This paper describes the model, tonality estimation methods and their implementation. The estimators are compared to each other by subjective tests. The results are presented and discussed.
@inproceedings{faucris.306451482, author = {Schöffler, Michael and Edler, Bernd and Herre, Jürgen}, booktitle = {Proceedings of the 10th International Symposium on Computer Music Multidisciplinary Research}, date = {2013-10-15/2013-10-18}, faupublication = {yes}, note = {herre{\_}papers{\_}subjective{\_}audio{\_}quality}, pages = {678--693}, peerreviewed = {unknown}, title = {{How} {Much} {Does} {Audio} {Quality} {Influence} {Ratings} of {Overall} {Listening} {Experience}?}, venue = {Marseille}, year = {2013} }
@inproceedings{faucris.227136982, abstract = {The Time-Warped Modified Discrete Cosine Transform (TW-MDCT) improves the energy compaction for harmonic signals with varying fundamental frequency compared to the plain MDCT. Adaptive context based entropy coding has the potential to provide higher gain over memoryless entropy coding. But in combination with the TW-MDCT, the context based adaptive coding may lead to suboptimal coding. This paper presents an algorithm for improving the context for the TW-MDCT. This is mainly achieved by exploiting already available information on the frequency variation needed by the TW-MDCT. This results in an improved entropy coding.
@inproceedings{faucris.227136738, abstract = {Modern transform audio coders often employ parametric enhancements, like noise substitution or bandwidth extension. In addition to these well-known parametric tools, it might also be desirable to synthesize parametric sinusoidal tones in the decoder. Low computational complexity is an important criterion in codec development and essential for acceptance and deployment. Therefore, efficient ways of generating these tones are needed. Since contemporary codecs like AAC or USAC are based on an MDCT domain representation of audio, we propose to generate synthetic tones by patching tone patterns into the MDCT spectrum at the decoder. We demonstrate how appropriate spectral patterns can be derived and adapted to their target location in (and between) the MDCT time/frequency (t/f) grid to seamlessly synthesize high quality sinusoidal tones including sweeps.
@inproceedings{faucris.227136494, author = {Taghipour, Armin and Edler, Bernd and Amirpour, Masoumeh and Herre, Jürgen}, booktitle = {Proc. Meetings Acous. (ICA)}, doi = {10.1121/1.4799876}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Dependency} of tonality perception on frequency, bandwidth, and duration}, url = {http://www.ica2013montreal.org/Proceedings/mss/050040{\_}1.pdf}, volume = {19}, year = {2013} }
@inproceedings{faucris.227136250, author = {Schöffler, Michael and Stöter, Fabian-Robert and Bayerlein, Harald and Edler, Bernd and Herre, Jürgen}, booktitle = {Proceedings of the 14th International Society for Music Information Retrieval Conference ({ISMIR})}, editor = {Souza Britto Jr., A. and Gouyon, F. and Dixon, S.}, faupublication = {yes}, note = {herre{\_}papers{\_}subjective{\_}audio{\_}quality}, pages = {389-394}, peerreviewed = {unknown}, title = {{An} {Experiment} {About} {Estimating} the {Number} of {Instruments} in {Polyphonic} {Music}: {A} {Comparison} {Between} {Internet} and {Laboratory} {Results}}, venue = {Curitiba}, year = {2013} }
@inproceedings{faucris.227136002, author = {Stöter, Fabian-Robert and Schöffler, Michael and Edler, Bernd and Herre, Jürgen}, booktitle = {Proceedings of Meetings on Acoustics}, doi = {10.1121/1.4799609}, faupublication = {yes}, note = {herre{\_}papers{\_}subjective{\_}audio{\_}quality}, pages = {035034 (9 pages)}, peerreviewed = {unknown}, title = {{Human} ability of counting the number of instruments in polyphonic music}, url = {http://scitation.aip.org/content/asa/journal/poma/19/1/10.1121/1.4799609}, volume = {19}, year = {2013} }
@inproceedings{faucris.227135760, abstract = {Perceptual audio codecs use psychoacoustic models for irrelevancy reduction by exploiting masking effects in the human auditory system. In masking, the tonality of the masker plays an important role and therefore should be evaluated in the psychoacoustic model. In this study a partial Spectral Flatness Measure (SFM) is applied to a filter bank-based psychoacoustic model to estimate tonality. The Infinite Impulse Response (IIR) band-pass filters are designed to take into account the spreading in simultaneous masking. Tonality estimation is adapted to temporal and spectral resolution of the auditory system. Employing subjective audio coding preference tests, the Partial SFM is compared with prediction-based tonality estimation.
@inproceedings{faucris.227137226, author = {Taghipour, Armin and Edler, Bernd and Amirpour, Masoumeh and Herre, Jürgen}, booktitle = {133rd Convention of the Audio Engineering Society}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Thresholds} for the discrimination of tonal and narrowband noise bursts}, url = {http://www.aes.org/e-lib/browse.cfm?elib=16481}, year = {2012} }
@inproceedings{faucris.227137955, abstract = {Modern music production often uses pre-recorded pieces of audio, so-called samples, taken from a huge sample database. Consequently, there is an increasing demand to extensively adapt these samples to their intended new musical environment in a flexible way. Such an application, for instance, retroactively changes the key mode of audio recordings, e.g. from a major key to minor key by a frequency selective transposition of pitch. Recently, the modulation vocoder (MODVOC) has been proposed to handle this task. In this paper, two enhancements to the MODVOC are presented and the subjective quality of its application to selective pitch transposition is assessed. Moreover, the proposed scheme is compared with results obtained by applying a commercial computer program, which became newly available on the market. The proposed method is clearly preferred in terms of the perceptual quality aspect "melody and chords transposition", while the commercial program is favored by the majority with regard to the aspect "timbre preservation".
@inproceedings{faucris.227137710, author = {Helmrich, Christian and Carlsson, Pontus and Disch, Sascha and Edler, Bernd and Hilpert, Johannes and Neusinger, Matthias and Purnhagen, Heiko and Rettelbach, Nikolaus and Robilliard, Julien and Villemoes, Lars F.}, booktitle = {Proceedings of the {IEEE} 2011 Int. Conference on Acoustics, Speech and Signal Processing ({ICASSP})}, doi = {10.1109/ICASSP.2011.5946449}, faupublication = {yes}, isbn = {978-1-4577-0537-3}, keywords = {Decoding; Transforms; Complexity theory; Audio coding; Transform coding; Compaction; Audio coding; MDCT; M/S stereo; prediction}, pages = {497-500}, peerreviewed = {unknown}, title = {{Efficient} {Transform} {Coding} of {Two}-{Channel} {Audio} {Signals} by {Means} of {Complex}-{Valued} {Stereo} {Prediction}}, venue = {Prague, Czech Republic}, year = {2011} }
@inproceedings{faucris.227137468, abstract = {Current video coding techniques use a Discrete Cosine Transform (DCT) to reduce spatial correlations within the motion estimation residual. Often the correlation cannot be completely eliminated leaving the transform coefficients statistically dependent. The presented paper proposes a method to predict these coefficients on a block level by using the distribution of the prediction error variance to improve coding efficiency. First experiments lead to a reduction in bit rate by 1.83% when compared to the standard JM 17.2 implementation results.
@inproceedings{faucris.227138200, abstract = {In this paper four combinations of perceptual models and transform coding systems are implemented and compared. The first of the two perceptual models is based on a DFT with a uniform frequency resolution. The second model uses IIR filters designed in accordance with the temporal/spectral resolution of the auditory system. Both of the two transform coding systems use a uniform spectral decomposition (MDCT). While in the first system the quantizers are directly controlled by the perceptual model, the second system uses a pre- and post-filter with frequency warping for shaping the quantization noise with a temporal/spectral resolution more adapted to the auditory system. Implementation details are given and results of subjective tests are presented.
@article{faucris.227138442, abstract = {In contemporary cochlear implant systems, the audio signal is decomposed into different frequency bands, each assigned to one electrode. Thus, pitch perception is limited by the number of physical electrodes implanted into the cochlea and by the wide bandwidth assigned to each electrode. The Harmony HiResolution bionic ear (Advanced Bionics LLC, Valencia, CA, USA) has the capability of creating virtual spectral channels through simultaneous delivery of current to pairs of adjacent electrodes. By steering the locus of stimulation to sites between the electrodes, additional pitch percepts can be generated. Two new sound processing strategies based on current steering have been designed, SpecRes and SineEx. In a chronic trial, speech intelligibility, pitch perception, and subjective appreciation of sound were compared between the two current steering strategies and standard HiRes strategy in 9 adult Harmony users. There was considerable variability in benefit, and the mean results show similar performance with all three strategies. © 2009 Waldo Nogueira et al.}, author = {Nogueira, Waldo and Litvak, Leonid and Edler, Bernd and Ostermann, Jörn and Büchner, Andreas}, doi = {10.1155/2009/531213}, faupublication = {yes}, journal = {EURASIP Journal on Advances in Signal Processing}, note = {UnivIS-Import:2015-04-20:Pub.2009.tech.IE.aulab.zentr.signal}, pages = {20}, peerreviewed = {Yes}, title = {{Signal} {Processing} {Strategies} for {Cochlear} {Implants} {Using} {Current} {Steering}}, volume = {2009}, year = {2009} }
@inproceedings{faucris.214432138, abstract = {The modified discrete cosine transform (MDCT) is often used for audio coding due to its critical sampling property and good energy compaction, especially for harmonic tones with constant fundamental frequencies (pitch). However, in voiced human speech the pitch is time-varying and thus the energy is spread over several transform coefficients, leading to a reduction of coding efficiency. The approach presented herein compensates for pitch variation in each MDCT block by application of time-variant re-sampling. A dedicated signal adaptive transform window computation ensures the preservation of the time domain aliasing cancelation (TDAC) property. Re-sampling can be designed such that the duration of the processed blocks is not altered, facilitating the replacement of the conventional MDCT in existing audio coders.}, address = {Munich, Germany}, author = {Edler, Bernd and Disch, Sascha and Bayer, Stefan and Fuchs, Guillaume and Geiger, Ralf}, booktitle = {126th AES Convention}, faupublication = {yes}, isbn = {9781615671663}, note = {UnivIS-Import:2019-03-25:Pub.2009.tech.IE.aulab.zentr.atimew}, pages = {588-595}, peerreviewed = {unknown}, publisher = {126th AES Convention}, title = {{A} time-warped {MDCT} approach to speech transform coding}, volume = {2}, year = {2009} }
@incollection{faucris.111320924, abstract = {Modern music production and sound generation often relies on manipulation of pre-recorded pieces of audio, so-called samples, taken from a huge database. Consequently, there is an increasing request to extensively adapt these samples to any new musical context in a flexible way. For this purpose, advanced digital signal processing is needed in order to realize audio effects like pitch shifting, time stretching or harmonization. Often, a key part of these processing methods is a signal adaptive, block based spectral segmentation operation. Hence, we propose a novel algorithm for such a spectral segmentation based on local centers of gravity (COG). The method was originally developed as part of a multiband modulation decomposition for audio signals. Nevertheless, this algorithm can also be used in the more general context of improved vocoder related applications.}, address = {Como, Italy}, author = {Disch, Sascha and Edler, Bernd}, booktitle = {12th International Conference on Digital Audio Effects}, faupublication = {yes}, note = {UnivIS-Import:2017-12-18:Pub.2009.tech.IE.aulab.zentr.aniter}, pages = {65-70}, peerreviewed = {unknown}, publisher = {DAFx-09}, title = {{An} iterative segmentation algorithm for audio signal spectra depending on estimated local centers of gravity}, year = {2009} }
@inproceedings{faucris.109141604, abstract = {The decomposition of audio signals into perceptually meaningful multiband modulation components opens up new possibilities for advanced signal processing. The signal adaptive analysis approach proposed in this paper will be shown to provide a powerful handle on the signal's perceptual properties: pitch, timbre or roughness can be manipulated straight forward. Additionally a synthesis method is specified providing high subjective perceptual quality. Furthermore, as an application example, a novel audio processing technique is proposed which changes the key mode of a given piece of music e.g. from major to minor key or vice versa. }, address = {Taipei, Taiwan}, author = {Disch, Sascha and Edler, Bernd}, booktitle = {ICASSP International Conference on Acoustics, Speech and Signal Processing}, doi = {10.1109/ICASSP.2009.4960081}, faupublication = {yes}, isbn = {9781424423545}, keywords = {Amplitude modulation; Audio coding; Frequency modulation; Time-frequency analysis}, note = {UnivIS-Import:2017-12-18:Pub.2009.tech.IE.aulab.zentr.multib}, pages = {2305-2308}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Multiband} perceptual modulation analysis, processing and synthesis of audio signals}, year = {2009} }
@article{faucris.112655004, abstract = {Traditionally, sound codecs have been developed with a particular application in mind, their performance being optimized for specific types of input signals, such as speech or audio (music), and application constraints, such as low bit rate, high quality, or low delay. There is, however, an increasing need for more generic sound codecs, created by the emergence of heterogeneous networks and the convergence of communication and entertainment devices. To obtain such versatility, this study employs hybrid sound coding based on operational rate-distortion (RD) optimization principles. Applying this concept, a prototype coder has been implemented with emphasis on (dynamic) adaptation to the input and to application constraints. With this prototype, listening tests have been performed for different application scenarios. The results demonstrate the versatility of the concept while keeping competitive sound quality compared to dedicated state-of-the-art codecs.}, author = {van Schijndel, Nicolle H. and Bensa, Julien and Christensen, Mads Graesboll and Colomes, Catherine and Edler, Bernd and Heusdens, Richard R. and Jensen, Jesper Højvang and Jensen, Søren Holdt and Kleijn, B. and Kot, Valery S. and Kövesi, Balázs and Lindblom, Jonas and Massaloux, Dominique and Niamut, Omar Aziz and Nordén, Fredrik and Plasberg, Jan H. and Vafin, Renat and van de Par, Steven and Virette, David and Wübbolt, Oliver}, faupublication = {yes}, journal = {Journal of the Audio Engineering Society}, note = {UnivIS-Import:2015-04-20:Pub.2008.tech.IE.aulab.zentr.adapti}, pages = {787-809}, peerreviewed = {unknown}, title = {{Adaptive} {RD} {Optimized} {Hybrid} {Sound} {Coding}}, volume = {56}, year = {2008} }
@incollection{faucris.111376584, abstract = {The decomposition of audio signals into perceptually meaningful modulation components is highly desirable for the development of new audio effects on the one hand and as a building block for future efficient audio compression algorithms on the other hand. In the past, there has always been a distinction between parametric coding methods and waveform coding: While waveform coding methods scale easily up to transparency (provided the necessary bit rate is available), parametric coding schemes are subjected to the limitations of the underlying source models. Otherwise, parametric methods usually offer a wealth of manipulation possibilities which can be exploited for application of audio effects, while waveform coding is strictly limited to the best as possible reproduction of the original signal. The analysis/synthesis approach presented in this paper is an attempt to show a way to bridge this gap by enabling a seamless transition between both approaches.}, address = {Espoo, Finland}, author = {Disch, Sascha and Edler, Bernd}, booktitle = {11th International Conference on Digital Audio Effects}, faupublication = {yes}, isbn = {9789512295173}, note = {UnivIS-Import:2017-12-18:Pub.2008.tech.IE.aulab.zentr.anampl}, pages = {257-263}, peerreviewed = {unknown}, publisher = {DAFx-08}, title = {{An} {Amplitude}- and {Frequency} {Modulation} {Vocoder} for {Audio} {Signal} {Processing}}, year = {2008} }
@inproceedings{faucris.227139898, abstract = {Efficient combinations of coding and manipulation of audio signals in the spectral domain are often desirable in communication systems. The modified discrete cosine transform (MDCT) represents a popular spectral transform in audio coding as it leads to compact signal representations. However, as the MDCT corresponds to a critically sampled filter bank, it is in general not appropriate to directly apply it to filtering tasks. In this paper we present a method to compensate for aliasing terms that arise from such direct MDCT domain filtering. The discussion is thereby based on a rigorous matrix representation of critically sampled filter banks which also leads to corresponding efficient realizations. As an application showcase, noise reduction for MDCT based speech coding is considered in simulations.
@inproceedings{faucris.203736286, address = {Granlibakken, USA}, author = {Nogueira, Waldo and Brendel, Martina and Edler, Bernd and Büchner, Andreas}, booktitle = {2007 Conference on Implantable Auditory Prostheses}, faupublication = {yes}, note = {UnivIS-Import:2018-09-06:Pub.2007.tech.IE.aulab.zentr.anovel}, pages = {-}, peerreviewed = {unknown}, publisher = {Conference on Implantable Auditory Prostheses}, title = {{A} {Novel} {Signal} {Processing} {Strategy} for {Current} {Steering} in {Cochlear} {Implants}}, year = {2007} }
@inproceedings{faucris.115906824, abstract = {Today, cochlear implants (CIs) are the treatment of choice in patients with profound hearing loss. However speech intelligibility with these devices is still limited. A factor that determines hearing performance is the processing method used in CIs. Therefore, research is focused on designing different speech processing methods. The evaluation of these strategies is subject to variability as it is usually performed with cochlear implant recipients. Hence, an objective method for the evaluation would give more robustness compared to the tests performed with CI patients. This paper proposes a method to evaluate signal processing strategies for CIs based on a hidden markov model speech recognizer. Two signal processing strategies for CIs, the Advanced Combinational Encoder (ACE) and the Psychoacoustic Advanced Combinational Encoder (PACE), have been compared in a phoneme recognition task. Results show that PACE obtained higher recognition scores than ACE as found with CI recipients.}, address = {Antwerp, Belgium}, author = {Nogueira, Waldo and Harczos, Tamás and Edler, Bernd and Ostermann, Jörn and Büchner, Andreas}, booktitle = {INTERSPEECH 2007. ICSLP, Proceedings of the Eighth International Conference on Spoken Language Processing. CD-ROM}, faupublication = {yes}, isbn = {9781605603162}, keywords = {Cochlear implant; HMM; Speech recognition}, note = {UnivIS-Import:2015-04-20:Pub.2007.tech.IE.aulab.zentr.automa}, pages = {1993-1996}, peerreviewed = {unknown}, publisher = {Interspeech}, title = {{Automatic} {Speech} {Recognition} with a {Cochlear} {Implant} {Front}-{End}}, year = {2007} }
@inproceedings{faucris.111466564, abstract = {A physiological and computational model of the human auditory system has been fitted in a signal processing strategy for cochlear implants (CIs). The aim of the new strategy is to obtain more natural sound in CIs by better mimicking the human auditory system. The new strategy was built in three independent stages as proposed in [6]. First a basilar membrane motion model was substituted by the filterbank commonly used in commercial strategies. Second, an inner hair cell model was included in a commercial strategy while maintaining the original filterbank. Third, both the basilar membrane motion and the inner-hair cell model were included in the commercial strategy. This paper analyses the properties and presents results obtained with CI recipients for each algorithm designed. }, address = {Lyon}, author = {Nogueira, Waldo and Kátai, András and Harczos, Tamás and Klefenz, Frank Markus and Büchner, Andreas and Edler, Bernd}, booktitle = {Annual International Conference of the IEEE Engineering in Medicine and Biology Society EMBC}, doi = {10.1109/IEMBS.2007.4353244}, faupublication = {yes}, isbn = {1424407885}, note = {UnivIS-Import:2015-04-20:Pub.2007.tech.IE.aulab.zentr.anaudi}, pages = {4127-4130}, peerreviewed = {unknown}, publisher = {Institute of Electrical and Electronics Engineers}, title = {{An} {Auditory} {Model} based {Strategy} for {Cochlear} {Implants}}, year = {2007} }
@inproceedings{faucris.227179313, author = {Edler, Bernd}, booktitle = {Proceedings of the First International Conference on Communications and Electronics (ICCE 06)}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Aliasing} {Reduction} for {Gain} {Control} with {Critically} {Sampled} {Filter} {Banks}}, venue = {Hanoi, Vietnam}, year = {2006} }
@inproceedings{faucris.227178828, abstract = {Current speech processing in cochlear implants use a filterbank to analyse audio signals into several frequency bands, each associated with one electrode. Because the processing is performed on input signal blocks of fixed sizes, the filterbank provides a unique time-frequency resolution to represent the various signal features. However, different components of audio signals may require different time-frequency resolutions for an accurate representation and perception. In this paper we investigate the influence on speech intelligibility in cochlear implants users when filterbanks with different time-frequency resolutions are used. In order to represent all signal features accurately, an adaptive filterbank has been developed that accepts input blocks of different sizes. The different resolutions required are achieved by adequately switching between block sizes depending on the input signal characteristics. The filterbank was incorporated into the commercial Advanced Combinational Encoder (ACE) and acutely tested on six cochlear implant recipients.}, author = {Albalate, Amparo and Nogueira, Waldo and Edler, Bernd and Büchner, Andreas}, booktitle = {2006 IEEE Biomedical Circuits and Systems Conference}, date = {2006-11-29/2006-12-01}, doi = {10.1109/BIOCAS.2006.4600345}, faupublication = {yes}, isbn = {978-1-4244-0436-0}, keywords = {Cochlear implants; Adaptive filters; Transient analysis; Speech; Electrodes; Implants; Signal resolution}, pages = {210-213}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Signal} {Analysis} by using {Adaptive} {Filterbanks} in {Cochlear} {Implants}}, url = {http://www.tnt.uni-hannover.de/papers/data/2006aawn01.pdf}, venue = {London}, year = {2006} }
@inproceedings{faucris.203736045, address = {Vienna, Austria}, author = {Nogueira, Waldo and Edler, Bernd and Frohne-Büchner, Caroline and Brendel, Martina and Büchner, Andreas}, booktitle = {CI-2006 - 9th International Conference on Cochlear Implants}, faupublication = {yes}, note = {UnivIS-Import:2018-09-06:Pub.2006.tech.IE.aulab.zentr.sinuso}, pages = {-}, peerreviewed = {unknown}, publisher = {CI-2006}, title = {{Sinusoidal} {Analysis} of {Audio} for {Current} {Steering} {Strategies} in {Cochlear} {Implants}}, year = {2006} }
@incollection{faucris.116478824, address = {Niedersachsen}, author = {Nogueira, Waldo and Edler, Bernd}, booktitle = {Technologie-Informationen}, edition = {6}, faupublication = {yes}, note = {UnivIS-Import:2015-04-20:Pub.2006.tech.IE.aulab.zentr.audios}, pages = {6}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Audiosignalverarbeitung} für {Cochlea}-{Implantate}}, volume = {4}, year = {2006} }
@incollection{faucris.111525524, abstract = {Standard video compression techniques apply motion-compensated prediction combined with transform coding of the prediction error. In the context of prediction with fractional-pel motion vector resolution it was shown, that aliasing components contained in an image signal are limiting the prediction efficiency obtained by motion compensation. In order to consider aliasing, quantization and motion estimation errors, camera noise, etc., we analytically developed a two dimensional (2D) non-separable interpolation filter, which is independently calculated for each frame by minimizing the prediction error energy. For every fractional-pel position to be interpolated, an individual set of 2D filter coefficients is determined. Since transmitting filter coefficients as side information results in an additional bit rate, which is almost constant for different image resolutions and total bit rates, the loss in coding gain increases when total bit rates sink. Therefore, we developed an algorithm, which regards the non-separable two-dimensional filter as a polyphase filter. For each frame, predicting the interpolation filter impulse response through evaluation of the polyphase filter, we only have to encode the prediction error of the filter coefficients.
@inproceedings{faucris.108837564, abstract = {Current speech processing strategies for cochlear implants use a filterbank which decomposes the audio signals into multiple frequency bands each associated with one electrode. Pitch perception with cochlear implants is related to the number of electrodes inserted in the cochlea and to the rate of stimulation of these electrodes. The filterbank should, therefore, be able to analyze the time-frequency features of the audio signals while also exploiting the time-frequency features of the implant. This study investigates the influence on speech intelligibility in cochlear implant users when filterbanks with different time-frequency resolutions are used. Three filterbanks, based on the structure of a wavelet packet transform but using different basis functions, were designed. The filterbanks were incorporated into a commercial speech processing strategy and were tested on device users in an acute study. © 2006 IEEE.}, address = {Toulouse, France}, author = {Nogueira, Waldo and Giese, Andreas and Edler, Bernd and Büchner, Andreas}, booktitle = {Int. Conf. on Acoustics, Speech, and Signal Processing}, doi = {10.1109/ICASSP.2006.1661227}, faupublication = {yes}, isbn = {142440469X}, note = {UnivIS-Import:2015-04-20:Pub.2006.tech.IE.aulab.zentr.wavele}, pages = {V121-V124}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Wavelet} {Packet} {Filterbank} for {Speech} {Processing} {Strategies} in {Cochlear} {Implants}}, url = {http://www.tnt.uni-hannover.de/papers/data/nogueira-icassp-2006.pdf}, year = {2006} }
@inproceedings{faucris.227181988, author = {Edler, Bernd and Nguyen, Dieu Thanh and Ostermann, Jörn and others}, author_hint = {Vatis Y., Edler B, T. Nguyen D., Ostermann J.}, booktitle = {ITU-T SG16/Q.6 Doc. VCEG-Z17}, faupublication = {yes}, peerreviewed = {unknown}, support_note = {Author relations incomplete. You may find additional data in field 'author{\_}hint'}, title = {{Two}-dimensional non-separable {Adaptive} {Wiener} {Interpolation} {Filter} for {H}.264/{AVC}}, venue = {Busan, South Korea}, year = {2005} }
@article{faucris.227180526, abstract = {We describe a new signal processing technique for cochlear implants using a psychoacoustic-masking model. The technique is based on the principle of a so-called "NofM" strategy. These strategies stimulate fewer channels (N) per cycle than active electrodes (NofM; N < M). In "NofM" strategies such as ACE or SPEAK, only the N channels with higher amplitudes are stimulated. The new strategy is based on the ACE strategy but uses a psychoacoustic-masking model in order to determine the essential components of any given audio signal. This new strategy was tested on device users in an acute study, with either 4 or 8 channels stimulated per cycle. For the first condition (4 channels), the mean improvement over the ACE strategy was 17%. For the second condition (8 channels), no significant difference was found between the two strategies.}, author = {Nogueira, Waldo and Büchner, Andreas and Lenarz, Thomas and Edler, Bernd}, doi = {10.1155/ASP.2005.3044}, faupublication = {yes}, journal = {EURASIP Journal on Applied Signal Processing}, pages = {3044-3059}, peerreviewed = {Yes}, title = {{A} {Psychoacoustic} {NofM}-type {Speech} {Coding} {Strategy} for {Cochlear} {Implants}}, volume = {127}, year = {2005} }
@inproceedings{faucris.114350104, abstract = {A source coding algorithm based on the classic Markov model is presented, which uses Vector Quantization and arithmetic coding in conjunction with a dynamically adapted context of previously coded vector indices. The core of this algorithm is the numerically optimized mapping from a large number of source states to a small number of different code tables. This enables its application to audio coding, where it provides higher efficiency than the quantization and lossless coding used in MPEG-AAC.}, address = {Barcelona}, author = {Edler, Bernd and Meine, Nikolaus}, booktitle = {118th AES Convention}, faupublication = {yes}, isbn = {9781604234848}, note = {UnivIS-Import:2015-04-20:Pub.2005.tech.IE.aulab.zentr.improv}, pages = {Preprint 6468}, peerreviewed = {unknown}, publisher = {Audio Engineering Society}, title = {{Improved} {Quantization} and {Lossless} {Coding} for {Subband} {Audio} {Coding}}, year = {2005} }
@inproceedings{faucris.108831184, abstract = {In the context of prediction with fractional-pel motion vector resolution it was shown, that aliasing components contained in an image signal are limiting the prediction accuracy obtained by motion compensation. In order to consider aliasing, quantisation and motion estimation errors, camera noise, etc., we analytically developed a two-dimensional (2D) non-separable interpolation filter, which is calculated for each frame independently by minimising the prediction error energy. For every fractional-pel position to be interpolated, an individual set of 2D filter coefficients is determined. As a result, a coding gain of up to 1,2 dB for HDTV-sequences and up to 0,5 dB for CIF-sequences compared to the standard H.264/AVC is obtained. }, address = {Genova, Italy}, author = {Vatis, Yuri and Edler, Bernd and Nguyen, Dieu Thanh and Ostermann, Jörn}, booktitle = {Proc. ICIP 2005, IEEE International Conference on Image Processing}, doi = {10.1109/ICIP.2005.1530200}, faupublication = {yes}, isbn = {0-7803-9134-9}, note = {UnivIS-Import:2015-04-20:Pub.2005.tech.IE.aulab.zentr.motion}, pages = {894-897}, peerreviewed = {unknown}, title = {{Motion}-and {Aliasing}-compensated {Prediction} using a two-dimensional non-separable {Adaptive} {Wiener} {Interpolation} {Filter}}, volume = {2}, year = {2005} }
@inproceedings{faucris.108830304, abstract = {Current speech processing strategies for cochlear implants are based on decomposing the audio signals into multiple frequency bands, each one associated with one electrode. However, these bands are relatively wide to accurately encode tonal components of audio signals. To improve the encoding of tonal components and performance in cochlear implants, a new signal processing strategy has been developed. The technique is based on the principle of a so-called NofM strategy. These strategies stimulate fewer channels (N) per cycle than active electrodes (M) (NofM; N < M). However, the new strategy incorporates a fundamental frequency estimator which is used to emphasize the periodic structure of tonal components. The new technique was acutely tested on cochlear implant recipients. First intelligibility tests showed similar performance in speech perception between the new strategy and a standard NofM strategy.}, address = {Barcelona}, author = {Büchner, Andreas and Edler, Bernd and Nogueira, Waldo}, booktitle = {118th AES Convention}, faupublication = {yes}, isbn = {9781604234848}, note = {UnivIS-Import:2015-04-20:Pub.2005.tech.IE.aulab.zentr.fundam}, pages = {Preprint 6515}, peerreviewed = {unknown}, publisher = {Audio Engineering Society}, title = {{Fundamental} {Frequency} {Coding} in {NofM} {Strategies} for {Cochlear} {Implants}}, year = {2005} }
@article{faucris.108826784, abstract = {We describe a new signal processing technique for cochlear implants using a psychoacoustic-masking model. The technique is based on the principle of a so-called "NofM" strategy. These strategies stimulate fewer channels (N) per cycle than active electrodes (NofM; N < M). In "NofM" strategies such as ACE or SPEAK, only the N channels with higher amplitudes are stimulated. The new strategy is based on the ACE strategy but uses a psychoacoustic-masking model in order to determine the essential components of any given audio signal. This new strategy was tested on device users in an acute study, with either 4 or 8 channels stimulated per cycle. For the first condition (4 channels), the mean improvement over the ACE strategy was 17%. For the second condition (8 channels), no significant difference was found between the two strategies.}, author = {Nogueira, Waldo and Büchner, Andreas and Lenarz, Thomas and Edler, Bernd}, doi = {10.1155/ASP.2005.3044}, faupublication = {yes}, journal = {EURASIP Journal on Applied Signal Processing}, keywords = {ACE; Cochlear implant; Masking; NofM; Psychoacoustic model; Speech coding}, note = {UnivIS-Import:2015-04-20:Pub.2005.tech.IE.aulab.zentr.apsych}, pages = {3044-3059}, peerreviewed = {unknown}, title = {{A} {Psychoacoustic} "{NofM}"-type {Speech} {Coding} {Strategy} for {Cochlear} {Implants}}, volume = {127}, year = {2005} }
@article{faucris.227182231, author = {Edler, Bernd}, faupublication = {yes}, journal = {NTZ, Telekommunikation und Informationstechnik}, peerreviewed = {unknown}, title = {{Audiocodierung} in {MPEG}-4}, year = {2004} }
@inproceedings{faucris.227182961, abstract = {Sinusoidal modelling forms the base of parametric audio coding systems, like MPEG-4 HILN, where it is combined with noise and transient models. A parametric encoder decomposes the audio signal into components that are described by appropriate models and represented by model parameters. To achieve efficient coding at very low bitrates, selection of the perceptually most relevant signal components (e.g. sinusoids) is essential, as only a limited number of component parameters can be conveyed in the bitstream. Various strategies for sinusoidal component selection have been proposed in the literature. This paper introduces a new, loudness-based strategy and tries to compare the different strategies using objective and subjective criteria.
@book{faucris.227182716, author = {Nishiguchi, Masayuki and Edler, Bernd}, editor = {Pereira, Fernando and Ebrahimi, Touradj}, faupublication = {yes}, isbn = {0-13-061621-4}, pages = {451-485}, peerreviewed = {unknown}, publisher = {Prentice Hall}, title = {{Speech} {Coding}}, year = {2002} }
@article{faucris.227182472, abstract = {This paper proposes a versatile perceptual audio coding method that achieves high compression ratios and is capable of low encoding/decoding delay. It accommodates a variety of source signals (including both music and speech) with different sampling rates. It is based on separating irrelevance and redundancy reductions into independent functional units. This contrasts traditional audio coding where both are integrated within the same subband decomposition. The separation allows for the independent optimization of the irrelevance and redundancy reduction units. For both reductions, we rely on adaptive filtering and predictive coding as much as possible to minimize the delay. A psycho-acoustically controlled adaptive linear filter is used for the irrelevance reduction, and the redundancy reduction is carried out by a predictive lossless coding scheme, which is termed weighted cascaded least mean squared (WCLMS) method. Experiments are carried out on a database of moderate size which contains mono-signals of different sampling rates and varying nature (music, speech, or mixed). They show that the proposed WCLMS lossless coder outperforms other competing lossless coders in terms of compression ratios and delay, as applied to the pre-filtered signal. Moreover, a subjective listening test of the combined pre-filter/lossless coder and a state-of-the-art perceptual audio coder (PAC) shows that the new method achieves a comparable compression ratio and audio quality with a lower delay.
@inproceedings{faucris.227183204, abstract = {The HILN (Harmonic and Individual Lines plus Noise) MPEG-4 parametric audio coding tool allows efficient representation of general audio signals at very low bit rates. Therefore possible applications include transmission over IP or wireless channels which are both characterised by specific transmission error models. On the other hand, since parametric audio coding is a relatively new technique compared to transform coding and CELP speech coding, there have been only very limited investigations on HILN`s behaviour in error prone environments. In this paper we present an analysis of error sensitivities and approaches to error protection and concealment.
@inproceedings{faucris.227184664, abstract = {A novel concept for perceptual audio coding is presented which is based on the combination of a pre- and post-filter, controlled by a psychoacoustic model, with a transform coding scheme. This paradigm allows modeling of the temporal and spectral shape of the masked threshold with a resolution independent of the used transform. By using frequency warping techniques the maximum possible detail for a given filter order can be made frequency-dependent and thus better adapted to the human auditory system. The filter coefficients are represented efficiently by LSF parameters which can be adaptively interpolated over time. First experiments with a system obtained by extending an existing transform codec showed that this approach can significantly improve the performance for speech signals, while the performance for other signals remained the same.
@inproceedings{faucris.227184421, abstract = {For very low bit rate audio coding applications in mobile communications or on the Internet, parametric audio coding has evolved as a technique complementing the more traditional approaches. These are transform codecs originally designed for achieving CD-like quality on one hand, and specialized speech codecs on the other hand. Both of these techniques usually represent the audio signal waveform in a way such that the decoder output signal gives an approximation of the encoder input signal, while taking into account perceptual criteria. Compared to this approach, in parametric audio coding the models of the signal source and of human perception are extended. The source model is now based on the assumption that the audio signal is the sum of "components," each of which can be approximated by a relatively simple signal model with a small number of parameters. The perception model is based on the assumption that the sound of the decoder output signal should be as similar as possible to that of the encoder input signal. Therefore, the approximation of waveforms is no longer necessary. This approach can lead to a very efficient representation. However, a suitable set of models for signal components, a good decomposition, and a good parameter estimation are all vital for achieving maximum audio quality. We give an overview on the current status of parametric audio coding developments and demonstrate advantages and challenges of this approach. Finally, we indicate possible directions of further improvements.
@inproceedings{faucris.227184177, author = {Edler, Bernd and Purnhagen, Heiko and Meine, Nikolaus}, booktitle = {109th AES Convention}, faupublication = {yes}, pages = {Preprint 5177}, peerreviewed = {unknown}, title = {{Speeding} up {HILN} - {MPEG}-4 {Parametric} {Audio} {Encoding} with {Reduced} {Complexity}}, url = {https://aes2.org/publications/elibrary-page/?id=9161}, year = {2000} }
@inproceedings{faucris.227183933, abstract = {Recently a new concept for perceptual audio coding was presented, which is based on a prefilter in the encoder and a corresponding postfilter in the decoder, both controlled by a psychoacoustic model. It enables individual selection of spectral and temporal resolutions for irrelevancy reduction and redundancy reduction. This paper addresses problems related to the efficient transmission of the filter parameters and presents techniques for efficient temporal and spectral modeling of masked thresholds using linear time-varying filters.
@inproceedings{faucris.227183690, abstract = {This paper shows a new way to reduce aliasing in critically sampled cascaded filter bank structures. Unlike standard tree structured methods, which lead to many aliasing components in the final subbands, our approach reduces the effect by canceling aliasing elements among subbands. Our interest lies in compression applications where we can apply the scheme to obtain an unequal or non-uniform band splitting using uniform cosine modulated filter banks. In an example it is shown that a reduction in aliasing of over 40 dB compared to a traditional tree structured filter bank can be achieved.}, author = {Schuller, Gerald and Edler, Bernd and Doser, Adele}, booktitle = {9th IEEE DSP Workshop}, faupublication = {yes}, peerreviewed = {unknown}, title = {{A} {Method} for {Alias} {Reduction} in {Cascaded} {Filter} {Banks}}, year = {2000} }
@inproceedings{faucris.227183448, author = {Edler, Bernd and Baumgarten, Nina}, booktitle = {DFG-Abschlu{\ss}kolloquium}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Ein} {Psychophysiologisches} {Gehörmodell} zur {Nachbildung} von {Wahrnehmungsschwellen} für die {Audiocodierung}}, year = {2000} }
@article{faucris.227184906, abstract = {While previous MPEG Audio standards mainly were focused on the representation of audio signals close to or equal to CD quality, the new MPEG-4 Audio standard extends the range of applicability towards significantly lower bit rates. Furthermore it offers extended functionalities for the representation of natural and even synthetic audio signals in an object oriented fashion. This paper gives a brief overview on the complete audio part of the MPEG-4 standard and more detailed information on its parts related to speech coding.
}, author = {Edler, Bernd}, faupublication = {yes}, journal = {International Journal of Speech Technology}, pages = {289-303}, peerreviewed = {unknown}, title = {{Speech} {Coding} in {MPEG}-4}, volume = {2}, year = {1999} }
@inproceedings{faucris.227185638, abstract = {In diesem Beitrag wird ein Verfahren zur Codierung von Audiosignalen mit Datenraten zwischen 4 kbit/s und 16 kbit/s vorgestellt. Es beruht auf einer objektbasierten Darstellung des Eingangssignals durch Überlagerung verschiedener Teilquellensignale. Hierfür werden geeignete Teilquellenmodelle eingeführt. Anschließend wird eine Analyse/Synthese-Struktur beschrieben, die zur Extraktion der Teilquellensignale, zur Auswahl geeigneter Teilquellenmodelle und zur modellbasierten Parameterschätzung geeignet ist. Besonders wird auch darauf eingegangen, wie sich hierbei und bei der Codierung der zu übertragenden Parameter Wahrnehmungseigenschaften des menschlichen Gehörs ausnutzen lassen. Weiterhin wird die Effizienz dieses Verfahrens mit anderen Codierverfahren verglichen, welche für Anwendungen im gleichen Datenbereich verfügbar sind.
@inproceedings{faucris.227185394, abstract = {An Object-Based Analysis/Synthesis Audio Coder designed for bit rates between 4 and 16 kbit/s is presented. The coding is based on the decomposition of the input signal into audio objects which are described by appropriate source models and represented by model parameters. Object models for sinusoids, harmonic tones, and noise are utilized.
@inproceedings{faucris.227185150, abstract = {Recently an audio coding technique based on a signal representation by parameters describing harmonic tones, individual sinusoidal components, and noise components has been presented, which shows its highest efficiency at very low bit rates. In this paper, concepts for combination with other techniques for extending the applicability at higher bit rates and for improving the quality in the presence of speech are presented.
@article{faucris.227186366, abstract = {During December 1995, subjective tests were carried out by members of the Moving Picture Experts Group (MPEG, ISO/JTC1/SC29/WG11) to select the proposed technology for inclusion in the audio part of the new MPEG-4 standard. The new standard addresses coding for more than just the functionality of data rate compression. Material coded at very low bit-rates is also included. Thus, different testing methodologies were applied, according to ITU-R Rec. BS 1116 for a bit-rate of 64 kbit/s per channel and according to ITU-T Rec. P.80 for lower bit-rates or functionalities other than data rate compression. Proposals were subjectively tested for coding efficiency, error resilience, scalability and speed change: a subset of the MPEG-4 ‘functionalities’. This paper describes how two different evaluation methods were used and adjusted to fit the different testing requirements. This first major effort to test coding schemes at low bit-rates proved successful. Based on the test results, decisions for MPEG-4 technology were made. This was the first opportunity for MPEG members to carry out tests on the submitted functionalities. In the process, much was learnt. As a result, some suggestions are made to improve the way new functionalities can be subjectively evaluated.}, author = {Contin, Laura and Edler, Bernd and Meares, David and Schreiner, Peter}, doi = {10.1016/S0923-5965(97)00005-2}, faupublication = {yes}, journal = {Signal Processing-Image Communication}, pages = {327-342}, peerreviewed = {unknown}, title = {{Tests} on {MPEG}-4 {Audio} {Codec} {Proposals}}, volume = {9}, year = {1997} }
@inproceedings{faucris.227186124, author = {Edler, Bernd}, booktitle = {4th International Workshop on Systems, Signals and Image Processing}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Overview} on the {Current} {Development} of {MPEG}-4 {Audio} {Coding}}, year = {1997} }
@inproceedings{faucris.227185881, author = {Edler, Bernd}, booktitle = {14th International AES Conference "internet.aes.org"}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Very} {Low} {Bit} {Rate} {Audio} {Coding} {Development}}, year = {1997} }
@inproceedings{faucris.227186854, abstract = {An analysis/synthesis audio codec (ASAC) is presented, which allows the coding of audio signals at very low-bit rates for applications, such as mobile communication or multimedia database access via modem and analog telephone lines. Compression with bit rates between 6 kbit/s and 24 kbit/s is addressed. Furthermore, the implementation of special effects, such as independent pitch change and speed change in the decoder, are described.
@inproceedings{faucris.227186610, abstract = {After a short survey of the goals of the current standardization activities within MPEG-4 Audio, an introduction to the concept of verification model (VM) based codec development is given, and the current status of the VM development is described. The results of listening tests and core experiments are addressed briefly in order to allow a comparison of the expected codec efficiency with that of existing standards.
@article{faucris.227187098, abstract = {A window design and fast algorithm for the overlapping block transform (OBT) of size N/spl times/L are presented. The presented algorithm for the OBT reduces the calculation complexity to an N/spl times/N transform with a fast algorithm and a simple preprocessing including windowing. A signal-independent window optimization strategy is introduced for image coding application. Results for a first-order Markov model and an image coding experiment show, that the coding gains of the optimized OBTs increase and blocking effects decrease with increasing window length L. A comparison with DCT-coding shows that the OBT, which has a slightly increased realization complexity, provides higher coding gain and a significant blocking effect reduction.
@article{faucris.227187341, abstract = {A common theory of lapped orthogonal transforms (LOTs) and critically sampled filter banks, called L into N coding (LINC), is presented. The theory includes a unified analysis of both coding methods and identity relations between the transform, inverse transform, analysis filter bank, and synthesis filter bank. A design procedure for LINC analysis/synthesis systems, which satisfy the conditions for perfect reconstruction, is developed. The common LINC theory is used to define an ideal LINC system which is used, together with the power spectral density of the input signal, to calculate theoretical bounds for the coding gain. A generalized overlapping block transform (OBT) with time domain aliasing cancellation (TDAC) is used to approximate the ideal LINC. A generalization of the OBT includes multiple block overlap and additional windowing. A recursive design procedure for windows of arbitrary lengths is presented. The coding gain of the generalized OBT is higher than that of the Karhunen-Loeve transform (KLT) and close to the theoretical bounds for LINC. In the case of image coding, the generalized OBT reduces the blocking effects when compared with the DCT.
@article{faucris.227188313, abstract = {The general conditions of exact reconstruction and a recursive design procedure for lapped orthogonal transform (LOT) with arbitrary length of overlapping are presented. It is shown that LOT can be realized with any standard block transform, discrete cosine transform (DCT), for example, and an additional processing. This processing must also satisfy the same conditions for exact reconfigurations and it may be pretransform processing in the time domain or post-transform processing in the transform domain. In a few examples it is shown that the LOT has a higher coding gain and smaller blocking effects then DCT. With the proposed LOT design procedure, two optimizations, the coding gain maximization and the blocking effect minimization, are presented and compared.
@inproceedings{faucris.227188071, abstract = {Perceptual coding techniques have recently been applied successfully to high-quality coding of digital audio signals. The basic perceptual coding system uses an analysis/synthesis system to map the time-domain data into a number of frequency-domain channels. A perceptual model is used to estimate the amount of noise shaping needed in order to avoid any audible noise due to the quantization of the frequency domain data. Different filterbanks have been applied to perceptual coding. A comparison of different filterbanks shows that there is no performance penalty for hybrid filterbanks compared to other solutions.
@article{faucris.227187827, author = {Edler, Bernd}, doi = {10.1049/el:19920697}, faupublication = {yes}, journal = {Electronics Letters}, pages = {1104-1105}, peerreviewed = {Yes}, title = {{Aliasing} {Reduction} in {Subbands} of {Cascaded} {Filter} {Banks} with {Decimation}}, volume = {28}, year = {1992} }
@inproceedings{faucris.227187585, author = {Brandenburg, Karlheinz and Sporer, Thomas and Edler, Bernd}, booktitle = {EUSIPCO '92}, faupublication = {yes}, peerreviewed = {unknown}, title = {{The} {Use} of {Multirate} {Filter} {Banks} for {High} {Quality} {Digital} {Audio}}, year = {1992} }
@inproceedings{faucris.115735224, address = {San Diego}, author = {Brandenburg, Karlheinz and Eberlein, K and Herre, Jürgen and Edler, Bernd}, booktitle = {IEEE ISCAS}, faupublication = {yes}, note = {Invited Talks at Conferences etc.}, pages = {-}, peerreviewed = {unknown}, publisher = {IEEE}, title = {{Comparison} of {Filterbanks} for {High} {Quality} {Audio} {Coding}}, venue = {San Diego}, year = {1992} }
@article{faucris.227188799, author = {Edler, Bernd}, doi = {10.1515/FREQ.1989.43.9.252}, faupublication = {yes}, journal = {Frequenz}, pages = {252-256}, peerreviewed = {Yes}, title = {{Codierung} von {Audiosignalen} mit überlappender {Transformation} und adaptiven {Fensterfunktionen}}, volume = {43}, year = {1989} }
@inproceedings{faucris.227188557, abstract = {Das hier vorgestellte Verfahren der Bitratenreduktion für Audiosignale basiert auf überlappenden Transformationen mit „Time Domain Aliasing Cancellation“, deren Fensterfunktionen und Transformationslängen in Abhängigkeit vom Eingangssignal umgeschaltet werden. Die adaptive Fensterung verbessert das Verhalten der Transformationscodierung mit überlappenden Blöcken, die sich durch einen hohen Codierungsgewinn auszeichnet, beim Auftreten von Impulsen und Amplitudensprüngen im Eingangssignal.
@inproceedings{faucris.227180284, author = {Edler, Bernd}, booktitle = {8. ITG-Fachtagung Hörrundfunk}, faupublication = {yes}, peerreviewed = {unknown}, title = {{Prädiktive} {Teilbandcodierung} mit {Vektorquantisierung} für hochqualitative {Audiosignale}}, year = {1988} }