@inproceedings{WB_05_Interface_Search,
  author = {W. Bainbridge and D.W. Oard and R.W. White},
  title = {An Interface to Search Human Movements Based on Geographic and Chronological
    Metadata},
  booktitle = {Proceedings of the 28th Annual International ACM SIGIR Conference
    on Research and Development in Information Retrieval},
  year = {2005},
  abstract = {Historians and scholars can better understand historic events by studying
    the geographic and chronological activity of individuals who witnessed
    them. A lack of adequate tools to help users study these activities
    can hinder the process of learning and discovery. In this paper
    we present an interface to address this problem that contains three
    components: a map, a timeline, and a text representation of a survivor's
    movements. These components simultaneously provide query input (where
    users can specify their needs) and dynamic results display (where
    users can immediately see the effect of their decisions). The results
    of a pilot study show that users reacted positively to the interface.},
  owner = {dkw},
  pdf = {pubs/WB_05_Interf_Search.pdf},
  timestamp = {2006.07.17},
}
@inproceedings{ZB_06_pygmy,
  author = {Z. Band and R.W. White},
  title = {Pygmy{B}rowse: A Small Screen Tree Browser},
  booktitle = {{CHI} '06: {CHI} '06 Extended Abstracts on Human Factors in Computing
    Systems},
  year = {2006},
  pages = {514--519},
  address = {Montr{\'e}al, Qu{\'e}bec, Canada},
  publisher = {ACM Press, New York, NY, USA},
  abstract = {We present PygmyBrowse, a browser that allows users to navigate a
    tree data structure in a limited amount of display space. A pilot
    evaluation of PygmyBrowse was conducted, and results suggest that
    it reduces task completion times and increases user satisfaction
    over two alternative node-link tree browsers.},
  doi = {10.1145/1125451.1125562},
  owner = {dkw},
  pdf = {pubs/ZB_06_pygmy.pdf},
  timestamp = {2006.08.01},
}
@inproceedings{WB_06_Min_Bayes_Risk,
  author = {W. Byrne},
  title = {Minimum {B}ayes Risk Estimation and Decoding in Large Vocabulary
    Continuous Speech Recognition},
  booktitle = {Proceedings of the Institute of Electronics, Information, and Communication
    Engineers, Japan -- Special Section on Statistical Modeling for Speech
    Processing, E89-D(3)},
  year = {2006},
  address = {Japan},
  month = {March},
  abstract = {Minimum Bayes risk estimation and decoding strategies based on lattice
    segmentation techniques can be used to refine large vocabulary continuous
    speech recognition systems through the estimation of the parameters
    of the underlying hidden Markov models and through the identification
    of smaller recognition tasks which provides the opportunity to incorporate
    novel modeling and decoding procedures in LVCSR. These techniques
    are discussed in the context of going `beyond HMMs', showing in particular
    that this process of subproblem identification makes it possible
    to train and apply small-domain binary pattern classifiers, such
    as Support Vector Machines, to large vocabulary continuous speech
    recognition.},
  owner = {dkw},
  pdf = {pubs/WB_06_Min_Bayes_Risk_Decod.pdf},
  timestamp = {2006.07.15},
}
@inproceedings{WB_04_Min_Bayes_Risk,
  author = {W. Byrne},
  title = {Minimum {B}ayes Risk Estimation and Decoding in Large Vocabulary
    Continuous Speech Recognition},
  booktitle = {Proceedings of the ATR Workshop -- Beyond HMMs},
  year = {2004},
  address = {Kyoto, Japan},
  month = {December},
  abstract = {Minimum risk estimation and decoding strategies based on lattice segmentation
    techniques can be used to refine large vocabulary continuous speech
    recognition systems through the estimation of the parameters of
    the underlying hidden Markov models and through the identification
    of smaller recognition tasks which provides the opportunity to incorporate
    novel modeling and decoding procedures in LVCSR. These techniques
    are discussed in the context of going beyond HMMs.},
  owner = {dkw},
  pdf = {pubs/WB_04_Min_Bay_BeyondHMMs.pdf},
  timestamp = {2006.07.15},
}
@article{malach_sptrans03,
  author = {W. Byrne and D. Doermann and M. Franz and S. Gustman and J. Hajic
    and D.W. Oard and M. Picheny and J. Psutka and B. Ramabhadran and
    D. Soergel and T. Ward and Wei-Jing Zhu},
  title = {Automatic Recognition of Spontaneous Speech for Access to Multilingual
    Oral History Archives},
  journal = {{IEEE} Transactions on Speech and Audio Processing, Special Issue
    on Spontaneous Speech Processing},
  year = {2004},
  volume = {12},
  pages = {420--435},
  number = {4},
  month = {July},
  abstract = {The MALACH project has the goal of developing the technologies needed
    to facilitate access to large collections of spontaneous speech.
    Its aim is to dramatically improve the state of the art in key Automatic
    Speech Recognition (ASR), Natural Language Processing (NLP) technologies
    for use in large-scale retrieval systems. The project leverages
    a unique collection of oral history interviews with survivors of
    the Holocaust that has been assembled and extensively annotated
    by the Survivors of the Shoah Visual History Foundation.
    This paper describes the collection, 116,000 hours of interviews in
    32 languages, and the way in which system requirements have been
    discerned through user studies. It discusses ASR methods for very
    difficult speech (heavily accented, emotional, and elderly spontaneous
    speech), including transcription to create training data and methods
    for language modeling and speaker adaptation. Results are presented
    for English and Czech. NLP results are presented for named entity
    tagging, topic segmentation, and supervised topic classification,
    and the architecture of an integrated search system that uses these
    results is described.},
}
@misc{malach_asist_2002,
  author = {{D. Soergel (moderator)} and S. Gustman and M. Kornbluh and B. Ramabhadran
    and J. Goldman},
  title = {Panel Discussion on Access to Large Spoken Archives: Uses and Technology},
  howpublished = {Proc. of the 65th ASIS\&T Annual Meeting, Philadelphia, PA},
  month = {November},
  year = {2002},
  note = {Medford, NJ: Information Today. pp. 469--470},
  abstract = {With recent advances in information technology, digital archiving
    is emerging as an important and practical method for capturing the
    human experience. Large amounts of spoken materials and audiovisual
    materials in which speech is an important component are becoming
    available. This panel will discuss the uses of these materials for
    education, information retrieval and dissemination, and research,
    the requirements that arise from these uses, and speech recognition
    and retrieval technologies being developed to meet these requirements.
    These materials have tremendous potential for enriching the presentation
    of information in education, newscasts and documentaries, but retrieval
    from and access to these large repositories pose significant challenges.
    The panel will provide an overview of these issues.},
  pdf = {pubs/ASIST2002SpokenArchives.pdf},
}
@article{VD_05_Lat_Segment,
  author = {V. Doumpiotis and W. Byrne},
  title = {Lattice Segmentation and Minimum {B}ayes Risk Discriminative Training
    for Large Vocabulary Continuous Speech Recognition},
  journal = {Speech Communication},
  year = {2005},
  volume = {2},
  pages = {142--160},
  abstract = {Lattice segmentation techniques developed for Minimum Bayes Risk decoding
    in large vocabulary speech recognition tasks are used to compute
    the statistics for discriminative training algorithms that estimate
    HMM parameters so as to reduce the overall risk over the training
    data. New estimation procedures are developed and evaluated for
    small vocabulary and large vocabulary recognition tasks, and additive
    performance improvements are shown relative to maximum mutual information
    estimation. These relative gains are explained through a detailed
    analysis of individual word recognition errors.},
  owner = {dkw},
  pdf = {pubs/VD_05_Lat_Segment.pdf},
  timestamp = {2006.07.15},
}
@inproceedings{VD_04_Pinched_Lat,
  author = {V. Doumpiotis and W. Byrne},
  title = {Pinched Lattice Minimum {B}ayes-risk Discriminative Training for
    Large Vocabulary Continuous Speech Recognition},
  booktitle = {Proceedings of the International Conference on Spoken Language Processing},
  year = {2004},
  month = {September},
  abstract = {Iterative estimation procedures that minimize empirical risk based
    on general loss functions such as the Levenshtein distance have
    been derived as extensions of the Extended Baum Welch algorithm.
    While reducing expected loss on training data is a desirable training
    criterion, these algorithms can be difficult to apply. They are
    unlike MMI estimation in that they require an explicit listing of
    the hypotheses to be considered and in complex problems such lists
    tend to be prohibitively large. To overcome this difficulty, modeling
    techniques originally developed to improve search efficiency in
    Minimum Bayes Risk decoding can be used to transform these estimation
    algorithms so that exact update, risk minimization procedures can
    be used for complex recognition problems. Experimental results in
    two large vocabulary speech recognition tasks show improvements
    over conventionally trained MMIE models.},
  owner = {dkw},
  pdf = {pubs/VD_04_Pinched_lat.pdf},
  timestamp = {2006.07.15},
}
@article{VD_05_Discrim_Linear,
  author = {V. Doumpiotis and S. Tsakalidis and W. Byrne},
  title = {Discriminative Linear Transforms for Feature Normalization and Speaker
    Adaptation in {HMM} Estimation},
  journal = {{IEEE} Transactions on Speech and Audio Processing},
  year = {2005},
  volume = {13},
  number = {3},
  month = {May},
  abstract = {Linear transforms have been used extensively for training and adaptation
    of HMM-based ASR systems. Recently procedures have been developed
    for the estimation of linear transforms under the Maximum Mutual
    Information (MMI) criterion. In this paper we introduce discriminative
    training procedures that employ linear transforms for feature normalization
    and for speaker adaptive training. We integrate these discriminative
    linear transforms into MMI estimation of HMM parameters for improvement
    of large vocabulary conversational speech recognition systems.},
  pdf = {pubs/VD_05_Discrim_linear.pdf},
}
@inproceedings{dtsmbr_icassp03,
  author = {V. Doumpiotis and S. Tsakalidis and W. Byrne},
  title = {Discriminative Training for Segmental Minimum {B}ayes-Risk Decoding},
  booktitle = {{IEEE} Conference on Acoustics, Speech and Signal Processing},
  year = {2003},
  organization = {IEEE},
  abstract = {A modeling approach is presented that incorporates discriminative
    training procedures within segmental Minimum Bayes-Risk decoding
    (SMBR). SMBR is used to segment lattices produced by a general automatic
    speech recognition (ASR) system into sequences of separate decision
    problems involving small sets of confusable words. Acoustic
    models specialized to discriminate between the competing words in
    these classes are then applied in subsequent SMBR rescoring passes.
    Refinement of the search space that allows the use of specialized
    discriminative models is shown to be an improvement over rescoring
    with conventionally trained discriminative models.},
  pdf = {pubs/dmtsmbr.icassp03.pdf},
}
@inproceedings{smbdt_eurospeech03,
  author = {V. Doumpiotis and S. Tsakalidis and W. Byrne},
  title = {Lattice Segmentation and Minimum {B}ayes Risk Discriminative Training},
  booktitle = {Proc. of the European Conference on Speech Communication and Technology
    (EUROSPEECH)},
  year = {2003},
  abstract = {Modeling approaches are presented that incorporate discriminative
    training procedures in segmental Minimum Bayes-Risk decoding (SMBR).
    SMBR is used to segment lattices produced by a general automatic
    speech recognition (ASR) system into sequences of separate decision
    problems involving small sets of confusable words. We discuss two
    approaches to incorporating these segmented lattices in discriminative
    training. We investigate the use of acoustic models specialized
    to discriminate between the competing words in these classes which
    are then applied in subsequent SMBR rescoring passes. Refinement
    of the search space that allows the use of specialized discriminative
    models is shown to be an improvement over rescoring with conventionally
    trained discriminative models.},
  pdf = {pubs/eurosp03dtsmbr.pdf},
}
@inproceedings{franz03:_infor,
  author = {M. Franz and B. Ramabhadran and M. Picheny},
  title = {Information Access in Large Spoken Archives},
  booktitle = {Proceedings of the ISCA Multilingual Spoken Document Retrieval Workshop},
  year = {2003},
  address = {Macau},
  abstract = {Digital archives have emerged as the pre-eminent method for capturing
    the human experience. Before such archives can be used efficiently,
    their contents must be described. The scale of such archives along
    with the associated cost make it impractical to provide access via
    purely manual means, but automatic technologies for search in spoken
    materials still have relatively limited capabilities. The NSF funded
    MALACH project will use the world's largest digital archive of video
    oral histories, collected by the Survivors of the Shoah Visual History
    Foundation (VHF) to make a quantum leap in the ability to access
    such archives by advancing the state-of-the-art in Automated Speech
    Recognition (ASR), Natural Language Processing (NLP) and related
    technologies. This corpus consists of over 115,000 hours of unconstrained,
    natural speech from 52,000 speakers in 32 different languages, filled
    with disfluencies, heavy accents, age-related coarticulations, and
    un-cued speaker and language switching. This paper discusses some
    of the ASR and NLP tools and technologies that we have been building
    for the English speech in the MALACH corpus. We will also discuss
    this new test bed while emphasizing the unique characteristics of
    this corpus.},
  pdf = {pubs/sdr_0203.pdf},
}
@inproceedings{franz03:_autom_tansc_topic_segmen_large_spoken_archiv,
  author = {M. Franz and B. Ramabhadran and T. Ward and M. Picheny},
  title = {Automated Transcription and Topic Segmentation of Large Spoken Archives},
  booktitle = {Proceedings of {EUROSPEECH}},
  year = {2003},
  address = {Geneva},
  month = {September},
  abstract = {Digital archives have emerged as the pre-eminent method for capturing
    the human experience. Before such archives can be used efficiently,
    their contents must be described. The scale of such archives along
    with the associated content mark up cost make it impractical to
    provide access via purely manual means, but automatic technologies
    for search in spoken materials still have relatively limited capabilities.
    The NSF-funded MALACH project will use the world's largest digital
    archive of video oral histories,
    collected by the Survivors of the Shoah Visual History Foundation
    (VHF) to make a quantum leap in the ability to access such archives
    by advancing the state-of-the-art in Automated Speech Recognition
    (ASR), Natural Language Processing (NLP) and related technologies.
    This corpus consists of over 115,000 hours of unconstrained, natural
    speech from 52,000 speakers in 32 different languages, filled with
    disfluencies, heavy accents, age-related coarticulations, and un-cued
    speaker and language switching. This paper discusses some of the
    ASR and NLP tools and technologies that we have been building for
    the English speech in the MALACH corpus. We also discuss this new
    test bed while emphasizing the unique characteristics of this corpus.},
  pdf = {pubs/euro03-ir.pdf},
}
@article{VG_04_Seg-Min,
  author = {V. Goel and S. Kumar and W. Byrne},
  title = {Segmental Minimum {B}ayes-Risk Decoding for Automatic Speech Recognition},
  journal = {{IEEE} Transactions on Speech and Audio Processing},
  year = {2004},
  volume = {12},
  pages = {234--249},
  month = {May},
  publisher = {IEEE},
  note = {Correction Available : In our recently published paper, we presented
    a risk-based lattice cutting procedure to segment ASR word lattices
    into smaller sub-lattices as a means to improve the efficiency of
    Minimum Bayes-Risk (MBR) rescoring. In the experiments reported,
    some of the hypotheses in the original lattices were inadvertently
    discarded during segmentation, and this affected MBR performance
    adversely. This note gives the corrected results as well as experiments
    demonstrating that the segmentation process does not discard any
    paths from the original lattice. http://mi.eng.cam.ac.uk/~wjb31/ppubs/smbrtsapcorr.pdf},
  abstract = {Minimum Bayes-Risk (MBR) speech recognizers have been shown to yield
    improvements over the search over word lattices. We present a Segmental
    Minimum Bayes-Risk decoding (SMBR) framework that simplifies the
    implementation of MBR recognizers through the segmentation of the
    N-best lists or lattices over which the recognition is to be performed.
    This paper presents lattice cutting procedures that underlie SMBR
    decoding. Two of these procedures are based on a risk minimization
    criterion while a third one is guided by word-level confidence scores.
    In conjunction with SMBR decoding, these lattice segmentation procedures
    give consistent improvements in recognition word error rate (WER)
    on the Switchboard corpus. We also discuss an application of risk-based
    lattice cutting to multiple-system SMBR decoding and show that it
    is related to other system combination techniques such as ROVER.
    This strategy combines lattices produced from multiple ASR systems
    and is found to give WER improvements in a Switchboard evaluation
    system.},
  owner = {dkw},
  pdf = {pubs/VG_04_Seg_Min.pdf},
  timestamp = {2006.07.15},
}
@article{JG_05_Access_Spoken,
  author = {J. Goldman and S. Renals and S. Bird and F. de Jong and M. Federico
    and C. Fleischhauer and M. Kornbluh and L. Lamel and D.W. Oard
    and F. Sebastiani and C. Stewart and R. Wright},
  title = {Accessing the Spoken Word},
  journal = {International Journal on Digital Libraries},
  year = {2005},
  volume = {5},
  pages = {287--298},
  number = {4},
  month = {August},
  abstract = {Spoken word audio collections cover many domains, including radio
    and television broadcasts, oral narratives, governmental proceedings,
    lectures, and telephone conversations. The collection, access and
    preservation of such data is stimulated by political, economic,
    cultural and educational needs. This paper outlines the major issues
    in the field, reviews the current state of technology, examines
    the rapidly changing policy issues relating to privacy and copyright,
    and presents issues relating to the collection and preservation
    of spoken audio content.},
  owner = {dkw},
  pdf = {pubs/JG_05_Access_Spoken_Word.pdf},
  timestamp = {2006.07.17},
}
@inproceedings{malach_jcdl_2002,
  author = {S. Gustman and D. Soergel and D.W. Oard and W. Byrne and M. Picheny
    and B. Ramabhadran and D. Greenberg},
  title = {Supporting Access to Large Digital Oral History Archives},
  booktitle = {Proceedings of the Joint Conference on Digital Libraries},
  year = {2002},
  pages = {18--27},
  month = {July},
  abstract = {This paper describes our experience with the creation, indexing, and
    provision of access to a very large archive of videotaped oral histories
    --- 116,000 hours of digitized interviews in 32 languages from 52,000
    survivors, liberators, rescuers, and witnesses of the Nazi Holocaust.
    It goes on to identify a set of critical research issues that must
    be addressed if we are to provide full and detailed access to collections
    of this size: issues in user requirement studies, automatic speech
    recognition, automatic classification, segmentation, summarization,
    retrieval, and user interfaces. The paper ends by inviting others
    to discuss use of these materials in their own research.},
  pdf = {pubs/JCDL2002MALACH.pdf},
}
@inproceedings{XH_06_Evid_Persp,
  author = {X. Huang and D. Soergel},
  title = {An Evidence Perspective on Topical Relevance Types and Its Implications
    for Exploratory and Task-Based Retrieval},
  booktitle = {ISIC},
  year = {2006},
  address = {Sydney, Australia},
  month = {July},
  abstract = {The common view of topical relevance is limited to topic matching,
    resulting in IR systems' failure to detect more complex topical
    connections which are needed to respond to diversified user situations
    and tasks. To reveal the complex evidential relationships involved
    in topical relevance, we analyzed relevance assessments in the domain
    of history that used four types of topical relevance: Direct, indirect,
    context, and comparison. Each of these plays a special role in reasoning,
    making a conclusive argument, or performing a task. Incorporating
    these relevance types into IR systems allows users more flexibility
    and a better focus on their tasks.},
  owner = {dkw},
  pdf = {pubs/XH_06_Evid_Persp.pdf},
  timestamp = {2006.07.31},
}
@inproceedings{XH_05_iConf_poster,
  author = {X. Huang and D. Soergel},
  title = {Evidence-Based Interpretation of Topical Relevance Types: {T}owards
    a Richer Understanding of Topical Relevance},
  booktitle = {Poster at Ph.D. Poster Session, i-Conference},
  year = {2005},
  address = {Penn State University},
  owner = {dkw},
  pdf = {pubs/XH_05_iConf_poster.pdf},
  timestamp = {2006.08.02},
}
@inproceedings{XH_04_Rel_Judges,
  author = {X. Huang and D. Soergel},
  title = {Relevance Judges' Understanding of Topical Relevance Types: An Explication
    of an Enriched Concept of Topical Relevance},
  booktitle = {67th Annual Meeting of the American Society for Information Science
    and Technology (ASIS\&T)},
  year = {2004},
  abstract = {Despite the centrality of topical relevance in information retrieval
    system design and evaluation, understanding and implementation of
    it is usually limited to ``direct overall topical matching'' between
    the subject of the query and the subject of the document. The underlying
    assumption is that only a single type of topical relationship is
    involved. In related work, a relevance judgment instrument was developed
    for the Multilingual Access to Large Spoken ArCHives project (MALACH).
    It incorporates the five topical relevance types (direct relevance,
    indirect/circumstantial relevance, context relevance, comparison
    relevance, and pointer relevance) and was applied by four judges
    to items in the MALACH test collection in Summer 2003. This paper
    reports on the experiences and perceptions of the judges making
    more nuanced judgments about topical relevance. The results suggest
    that more than only one variable/dimension, ``whether it is on topic''
    as usually referred to, contributes to topical relevance, and more
    than a single topical relationship type, ``direct matching'' as generally
    assumed, play an important role in topical relevance.},
  owner = {dkw},
  pdf = {pubs/XH_04_Rel_Judges.pdf},
  timestamp = {2006.07.17},
}
@inproceedings{XH_05_Policy_Captur,
  author = {X. Huang and R.W. White},
  title = {Policy Capturing Models for Multi-Faceted Relevance Judgments},
  booktitle = {Proceedings of the 68th ASIS\&T Annual Meeting},
  year = {2005},
  abstract = {We applied policy capturing and bootstrapping methods to investigate
    the relevance judgment process, with a particular focus on understanding
    how judges summarize an overall relevance judgment from five specific
    aspects of relevance. Our
    data come from relevance judgments made in the development of the
    MALACH (Multilingual Access to Large Spoken ArCHives) Speech Retrieval
    Test Collection. We developed a linear model for each of four relevance
    judges by regressing his or her overall judgments on the five specific
    relevance aspects. According to these models, different judges tended
    to assign different importance weights to different aspects. One
    of the linear models was applied to seven new judgment sets and
    was highly successful at predicting accurate overall judgments for
    the seven judgment sets.},
  owner = {dkw},
  pdf = {pubs/XH_05_Policy_Cap.pdf},
  timestamp = {2006.07.17},
}
@inproceedings{DI_06_Invest_XLang,
  author = {D. Inkpen and M. Alzghool and G.J.F. Jones and D.W. Oard},
  title = {Investigating Cross-Language Speech Retrieval for a Spontaneous Conversational
    Speech Collection},
  booktitle = {Conference on Human Language Technologies and the North American
    Chapter of the Association for Computational Linguistics},
  year = {2006},
  address = {New York},
  abstract = {Cross-language retrieval of spontaneous speech combines the challenges
    of working with noisy automated document transcripts and language
    translation. The CLEF 2005 Cross-Language Speech Retrieval (CL-SR)
    task provides a standard test collection to investigate these challenges.
    In our experimental investigation we show that we can improve retrieval
    performance by careful selection of the term weighting scheme and
    by combining the automatic transcripts with manually-assigned metadata.
    We further show that online machine translation resources can be
    used for topic translation to give effective CL-SR.},
  owner = {dkw},
  pdf = {pubs/DI_06_Invest_XLang.pdf},
  timestamp = {2006.08.01},
}
@inproceedings{JK_03_Search_Large_Coll,
  author = {J. Kim and D.W. Oard and D. Soergel},
  title = {Searching Large Collections of Recorded Speech: A Preliminary Study},
  booktitle = {Annual Conference of the American Society for Information Science
    and Technology},
  year = {2003},
  address = {Long Beach, CA},
  month = {April},
  abstract = {This paper reports on an exploratory study of the criteria searchers
    use when judging the relevance of recorded speech from radio programs
    and the attributes of a recording on which those judgments are based.
    Five volunteers each performed three searches using two systems
    (NPR Online and SpeechBot) for three questions and judged the relevance
    of the results. Data were collected through observation and screen
    capture, think aloud, and interviews; coded; and analyzed by looking
    for patterns. Criteria used as a basis for selection were found
    to be similar to those observed in relevance studies with printed
    materials, but the attributes used as a basis for assessing those
    criteria were found to exhibit modality-specific characteristics.
    For example, audio replay was often found to be necessary when assessing
    story genre (e.g., report, interview, commentary) because of limitations
    in presently available metadata. Participants reported a strong
    preference for manually prepared summaries over passages extracted
    from automatic speech recognition transcripts, and consequential
    differences in search behavior were observed between the two conditions.
    Some important implications for interface and component design are
    drawn, such as the utility of summaries at multiple levels of detail
    in view of the difficulty of skimming imperfect transcripts and
    the potential utility of automatic speaker identification to support
    authority judgments in systems.},
  owner = {dkw},
  pdf = {pubs/JK_03_Search_Large_Coll.pdf},
  timestamp = {2006.07.15},
}
@inproceedings{JK_03_User_Inter,
  author = {J. Kim and D. Soergel and D.W. Oard},
  title = {User Interaction in Speech and Video Retrieval: Relevance Judgment
    and Query Reformulation},
  booktitle = {Presented at the HCIL Annual Symposium and Open House},
  year = {2003},
  address = {College Park, MD, USA},
  month = {May},
  abstract = {Speech retrieval systems are now beginning to appear as a means to
    access to spoken collections (news, oral histories, phone messages,
    recordings of meetings, etc.), but we do not yet understand well
    how these systems will be used. The purpose of this study is to
    explore the user behavior in interactive speech retrieval systems
    in the context of oral histories. In particular, this study seeks
    to answer such questions as: what relevance criteria searchers apply
    when they select a recording or a passage, how searchers attempt
    to match their query formulations to their information needs, and
    what metadata or information searchers find valuable both in making
    relevance judgments and in improving their queries. The study uses
    qualitative research methods. Eight participants that include faculty,
    Holocaust scholars, a documentary film producer, and a high school
    teacher searched the Shoah Visual History Foundation's collection
    that consists of 116,000 hours of 52,000 testimonies in 32 different
    languages from the survivors, liberators, rescuers and witnesses
    of the Holocaust. Each participant performed a series of searches
    based on her/his own interests over a period of one to two weeks.
    Data were collected through observation and screen capture, think
    aloud, and semi-structured interviews. Coding is being done and
    analyzed in order to find patterns. System and interface designers
    will benefit from the findings of this study in building future
    interactive speech retrieval systems. For example, the knowledge
    of preferred metadata adopted by searchers may suggest what metadata
    should be catalogued and to what level. The knowledge of how searchers
    reformulate their queries may inform system designers what information
    the system should present and what tools it should provide in order
    to support interactive searching.},
  owner = {dkw},
  timestamp = {2006.08.01},
}
@inproceedings{KNL_05_Rel_Crit,
  author = {K.N. Lawley and D. Soergel and X. Huang},
  title = {Relevance Criteria Used by Teachers in Selecting Oral History Materials},
  booktitle = {Proceedings of the Annual Meeting of the American Society for Information
    Science \& Technology ({ASIS\&T})},
  year = {2005},
  address = {Charlotte, NC},
  month = {October},
  abstract = {User-centered perspectives of relevance acknowledge the task-specific
    nature of relevance assessment, but we understand little about the
    retrieval and assessment tasks of teachers, a professional population
    with an important purpose. We observed eight school teachers throughout
    a collaborative process of designing lesson plans and searching
    for appropriate oral history materials and found an array of relevance
    criteria that pertain specifically to teaching. The objectives implied
    in their criteria correspond to the teaching objectives described
    in lesson plans and teacher interviews, including connecting with
    students, representing diversity, and teaching tolerance. Our findings
    suggest user-oriented design approaches that support retrieval of
    instructional materials in line with the needs and knowledge of
    teachers.},
  owner = {dkw},
  pdf = {pubs/KNL_05_ASIST.pdf},
  timestamp = {2006.07.17},
}
@inproceedings{KNL_05_iConf_poster,
  author = {K.N. Lawley and D. Soergel and R.W. White and X. Huang},
  title = {Teachers' Search for Multimedia Lesson Plan Materials: Study, Results,
    and Design Implications for Oral History Archives},
  booktitle = {Poster presented at i-Conference 2005: The First Conference of the
    i-School Community},
  year = {2005},
  address = {State College, PA},
  month = {September},
  abstract = {When teachers collect materials to use in their classrooms, they engage
    in a special case of information-seeking that involves task-specific
    relevance criteria and other workflow-related considerations. We
    observed eight middle-school and high-school teachers as they participated
    in a week-long workshop to collaborate on designing modular lesson
    plans for tolerance education. The lesson plans used passages of
    Holocaust survivor testimonies selected from a collection that was
    gathered and cataloged by the Survivors of the Shoah Visual History
    Foundation. The organization of the workshop provided a unique opportunity
    to understand how the selection of oral history materials occurs
    within the context of creating lesson plans.
    These context-rich data describe some of the motives, preferences,
    and constraints that influence how and why teachers seek and select
    oral history materials for tolerance education. The lessons learned
    from this workshop directly informed the design of a user interface
    that supports the needs and behaviors that we observed among teachers.
    Our conceptual framework draws on literature related to user-centered
    relevance, task-oriented information seeking, instructional design,
    and personalized instruction. Our poster will set forth the relevance
    criteria, strategies, and obstacles we observed during the workshop
    as well as the interface components that support teachers' workflow.},
  owner = {dkw},
  pdf = {pubs/KNL_05_iConf_poster.pdf},
  timestamp = {2006.07.17},
}
@INPROCEEDINGS{BL_06_One-Sided,
AUTHOR = {B. Liu and D.W. Oard},
TITLE = {{One-Sided} Measures for Evaluating Ranked Retrieval Effectiveness
with Spontaneous Conversational Speech},
BOOKTITLE = {Poster presented at {SIGIR}},
YEAR = {2006},
NOTE = {accepted},
OWNER = {dkw},
TIMESTAMP = {2006.08.01}
}
@INPROCEEDINGS{LM_06_Stat_Phrase,
AUTHOR = {L. Mathias and W. Byrne},
TITLE = {Statistical Phrase-Based Speech Translation},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2006},
ABSTRACT = {A generative statistical model of speech-to-text translation is developed
as an extension of existing models of phrase-based text translation.
Speech is translated by mapping ASR word lattices to lattices of
phrase sequences which are then translated using operations developed
for text translation. Performance is reported on Chinese to English
translation of Mandarin Broadcast News.},
OWNER = {dkw},
PDF = {pubs/LM_06_Stat_Phrase.pdf},
TIMESTAMP = {2006.07.17}
}
@INPROCEEDINGS{oard04:_trans_acces_spoken_word,
AUTHOR = {D.W. Oard},
TITLE = {Transforming Access to the Spoken Word},
BOOKTITLE = {Proceedings of the International Symposium on Large-Scale Knowledge
Resources},
YEAR = {2004},
MONTH = mar,
ORGANIZATION = {Tokyo Institute of Technology},
NOTE = {http://www.coe21-lkr.titech.ac.jp/english/symposium.html},
ABSTRACT = {For thousands of years, the written word has held a special place
in our lives. In part, this results from two key characteristics:
durability and searchability. Over the past several decades, the
spoken word has gradually acquired those characteristics. In our
lifetimes, it seems reasonable to expect that trend to continue,
and indeed to accelerate, as improvements in automatic speech recognition
begin to enable large-scale access to spontaneous conversational
speech. This paper identifies four fundamental challenges that must
be overcome if we are to leverage this remarkable new capability
for the greatest benefit, briefly describes one project that is
exploring this new frontier, and then concludes by looking toward
future research on this important problem.},
PDF = {pubs/oard_IntSympLKR_2004.pdf}
}
@MISC{oard_talk_Nov2003,
AUTHOR = {D.W. Oard},
TITLE = {Speaking to the Future},
MONTH = nov,
YEAR = {2003},
SLIDES = {pubs/oard_malach_talk_nov03.ppt}
}
@MISC{oard_talk_Oct2003,
AUTHOR = {D.W. Oard},
TITLE = {Searching Spoken Word Collections},
MONTH = oct,
YEAR = {2003},
NOTE = {Presented at Columbia University},
SLIDES = {pubs/malach_columbia_oard.ppt}
}
@INPROCEEDINGS{malach_cla_tsd02,
AUTHOR = {D.W. Oard and D. Demner-Fushman and J. Hajic and B. Ramabhadran and
S. Gustman and W. Byrne and D. Soergel and B. Dorr and P. Resnik
and M. Picheny},
TITLE = {Cross-Language Access to Recorded Speech in the {MALACH} Project},
BOOKTITLE = {Proceedings of the International Conference on Text, Speech and
Dialogue ({TSD})},
YEAR = {2002},
ADDRESS = {Brno, Czech Republic},
ABSTRACT = {The MALACH project seeks to help users find information in a vast
multilingual collection of untranscribed oral history interviews.
This paper introduces the goals of the project and focuses on supporting
access by users who are unfamiliar with the interview language.
It begins with a review of the state of the art in cross-language
speech retrieval; approaches that will be investigated are then
described. Czech was selected as the first non-English language
to be supported, so results of an initial experiment with Czech/English
cross-language retrieval are reported.},
PDF = {pubs/malach_cla_tsd02.pdf},
SLIDES = {pubs/tsd2002e.ppt}
}
@INPROCEEDINGS{malach_aaai_2003,
AUTHOR = {D.W. Oard and A. Leuski},
TITLE = {Searching Recorded Speech Based on the Temporal Extent of Topic Labels},
BOOKTITLE = {Proceedings of the {AAAI} Spring Symposium on Intelligent Multimedia
Knowledge Management},
YEAR = {2003},
MONTH = mar,
ABSTRACT = {Recorded speech poses unusual challenges for the design of interactive
end-user search systems. Automatic speech recognition is sufficiently
accurate to support the automated components of interactive search
systems in some applications. Recognizing useful recordings among
those nominated by the system is difficult, however, because listening
to audio is time consuming and because recognition errors and speech
disfluencies make it difficult to mitigate this time factor by skimming
automatic transcripts. Support for the browsing process based on
supervised learning for automatic classification has shown promise,
however, and a segment-then-label framework has emerged as the dominant
paradigm for applying that technique to news broadcasts. This paper
argues for a more general framework, which we call activation matrices,
that provide a flexible representation for the mapping between labels
and time. Three approaches by which activation matrices
could be generated are briefly described, with the main focus of
the paper being the use of activation matrices to support search
and selection in interactive systems.},
PDF = {pubs/malach_aaai_2003.pdf}
}
@INPROCEEDINGS{oard04:_build_infor_retriev_test_collec,
AUTHOR = {D.W. Oard and D. Soergel and D. Doermann and X. Huang and G.C. Murray
and J. Wang and B. Ramabhadran and M. Franz and S. Gustman and J.
Mayfield and L. Kharevych and S. Strassel},
TITLE = {Building an Information Retrieval Test Collection for Spontaneous
Conversational Speech},
BOOKTITLE = {Proceedings of {SIGIR}'04},
YEAR = {2004},
ADDRESS = {Sheffield, U.K.},
MONTH = jul,
PUBLISHER = {ACM},
ABSTRACT = {Test collections model use cases in ways that facilitate evaluation
of information retrieval systems. This paper describes the use of
search-guided relevance assessment to create a test collection for
retrieval of spontaneous conversational speech. Approximately 10,000
thematically coherent segments were manually identified in 625 hours
of oral history interviews with 246 individuals. Automatic speech
recognition results, manually prepared summaries, controlled vocabulary
indexing, and name authority control are available for every segment.
Those features were leveraged by a team of four relevance assessors
to identify topically relevant segments for 28 topics developed
from actual user requests. Search-guided assessment yielded sufficient
interannotator agreement to support formative evaluation during
system development. Baseline results for ranked retrieval are presented
to illustrate use of the collection.},
PDF = {pubs/malach_sigir04.pdf}
}
@INPROCEEDINGS{JSO_06_Coupl_Trng_Set,
AUTHOR = {J.S. Olsson},
TITLE = {An Analysis of the Coupling between Training Set and Neighborhood
Sizes for the {kNN} Classifier},
BOOKTITLE = {Proceedings of the 29th Annual International ACM SIGIR Conference
on Research and Development in Information Retrieval},
YEAR = {2006},
ABSTRACT = {We consider the relationship between training set size and the parameter
k for the k-Nearest Neighbors (kNN) classifier. When few examples
are available, we observe that accuracy is sensitive to k and that
best k tends to increase with training size. We explore the subsequent
risk that k tuned on partitions will be suboptimal after aggregation
and re-training. This risk is found to be most severe when little
data is available. For larger training sizes, accuracy becomes increasingly
stable with respect to k and the risk decreases.},
KEYWORDS = {text classification, k-Nearest Neighbors, parameter tuning, parameter
stability},
OWNER = {dkw},
PDF = {pubs/JSO_06_Coupl_Trng_Set.pdf},
TIMESTAMP = {2006.08.01}
}
@INPROCEEDINGS{JSO_05_CL_Text_Class,
AUTHOR = {J.S. Olsson and D.W. Oard and J. Hajic},
TITLE = {Cross-Language Text Classification},
BOOKTITLE = {Proceedings of the 28th Annual International ACM SIGIR Conference
on Research and Development in Information Retrieval},
YEAR = {2005},
PAGES = {645--646},
ADDRESS = {Salvador, Brazil},
MONTH = aug,
PUBLISHER = {ACM Press, New York, NY},
KEYWORDS = {cross-language text classification and topic classification},
OWNER = {dkw},
PDF = {pubs/JSO_05_CL_Text_Class.pdf},
TIMESTAMP = {2006.07.17}
}
@inproceedings{CP_06_Recog_Emot,
  author    = {C. Pietsch and B. Ramabhadran},
  title     = {A Novel Approach to the Automatic Recognition of Emotions in
               Natural Speech},
  booktitle = {ISEF},
  year      = {2006},
  owner     = {dkw},
  timestamp = {2006.08.01}
}
@INPROCEEDINGS{malach_icassp04,
AUTHOR = {J. Psutka and J. Hajic and W. Byrne},
TITLE = {{ASR} for {Slavic} Languages in the {MALACH} Project},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2004},
ORGANIZATION = {IEEE},
NOTE = {Invited Paper in Special Session on Multilingual Speech Processing},
ABSTRACT = {The development of acoustic training material for Slavic languages
within the MALACH project is described. Initial experience with
the variety of speakers and the difficulties encountered in transcribing
Czech, Slovak, and Russian language oral history are described along
with ASR recognition results intended to investigate the effectiveness
of different transcription conventions that address language specific
phenomena within the task domain.},
PDF = {pubs/JP_04_ASR_Slavic_Lang.pdf}
}
@INPROCEEDINGS{tsd03_ruasr,
AUTHOR = {J. Psutka and I. Iljuchin and P. Ircing and J.V. Psutka and V. Trejbal
and W. Byrne and J. Hajic and S. Gustman},
TITLE = {Building {LVCSR} Systems for transcription of spontaneously produced
{Russian} witnesses in the {MALACH} project: Initial steps and first
results},
BOOKTITLE = {Proceedings of the International Conference on Text, Speech and
Dialogue ({TSD})},
YEAR = {2003},
ABSTRACT = {The MALACH project uses the world's largest digital archive of video
oral histories collected by the Survivors of the Shoah Visual History
Foundation (VHF) and attempts to access such archives by advancing
the state-of-the-art in Automatic Speech Recognition and Information
Retrieval. This paper discusses the initial steps and first results
in building large vocabulary continuous speech recognition (LVCSR)
systems for the transcription of Russian witnesses. As the third
language processed in the MALACH project (following English and
Czech), Russian has posed new ASR challenges, especially in phonetic
modeling. Although most of the Russian testimonies were provided
by native Russian survivors, the speakers come from many different
regions and countries resulting in a diverse collection of accented
spontaneous Russian speech.},
PDF = {pubs/Russian_MALACH_tsd03.pdf}
}
@INPROCEEDINGS{psutka04:_issues_annot_czech_spont_speec,
AUTHOR = {J. Psutka and P. Ircing and J. Hajic and V. Radova and J.V. Psutka
and W. Byrne and S. Gustman},
TITLE = {Issues in Annotation of the {Czech} Spontaneous Speech Corpus in
the {MALACH} Project},
BOOKTITLE = {Proceedings of the International Conference on Language Resources
and Evaluation ({LREC})},
YEAR = {2004},
ABSTRACT = {The paper presents the issues encountered in processing spontaneous
Czech speech in the MALACH project. Specific problems connected
with a frequent occurrence of colloquial words in spontaneous Czech
are analyzed; a partial solution is proposed and experimentally
evaluated.},
PDF = {pubs/JP_04_Annot_Czech.pdf}
}
@INPROCEEDINGS{tsd03_czasr,
AUTHOR = {J. Psutka and P. Ircing and J.V. Psutka and V. Radova and W. Byrne
and J. Hajic and S. Gustman},
TITLE = {Towards automatic transcription of spontaneous {Czech} speech in
the {MALACH} project},
BOOKTITLE = {Proceedings of the International Conference on Text, Speech and
Dialogue ({TSD})},
YEAR = {2003},
ABSTRACT = {Our paper discusses the progress achieved during a one-year effort
in building the Czech LVCSR system for the automatic transcription
of spontaneously produced testimonies in the MALACH project. The
difficulty of this task stems from the highly inflectional nature
of the Czech language and is further multiplied by the presence
of many colloquial words in spontaneous Czech speech as well as
by the need to handle emotional speech filled with disfluencies,
heavy accents, age-related coarticulation and language switching.
In this paper we concentrate mainly on the acoustic modeling issues
- the proper choice of front-end parameterization, the handling of
non-speech events in acoustic modeling, and unsupervised acoustic
adaptation via MLLR. A method for selecting suitable language modeling
data is also briefly discussed.},
PDF = {pubs/Czech_Malach_tsd03_1.pdf}
}
@INPROCEEDINGS{czasr_tsd02,
AUTHOR = {J. Psutka and P. Ircing and J.V. Psutka and V. Radova and W. Byrne
and J. Hajic and S. Gustman and B. Ramabhadran},
TITLE = {Automatic Transcription of {Czech} Language Oral History in the {MALACH}
Project: Resources and Initial Experiments},
BOOKTITLE = {Proceedings of the International Conference on Text, Speech and
Dialogue ({TSD})},
YEAR = {2002},
ABSTRACT = {In this paper we describe the initial stages of the ASR component
of the MALACH project. This project will attempt to provide improved
access to the large multilingual spoken archives collected by the
Survivors of the Shoah Visual History Foundation by advancing the
state of the art in automated speech recognition. In order to train
the ASR system, it is necessary to manually transcribe a large amount
of speech data, identify the appropriate vocabulary, and obtain
relevant text for language modeling. We give a detailed description
of the speech annotation process; show the specific properties of
the spontaneous speech contained in the archives; and present baseline
speech recognition results.},
PDF = {pubs/tsd02.psutka.pdf}
}
@INPROCEEDINGS{malachczasr_eurospeech03,
AUTHOR = {J. Psutka and P. Ircing and J.V. Psutka and V. Radova and W. Byrne
and J. Hajic and J. Mirovsky and S. Gustman},
TITLE = {Large Vocabulary {ASR} for Spontaneous {Czech} in the {MALACH} Project},
BOOKTITLE = {Proceedings of the European Conference on Speech Communication and
Technology ({EUROSPEECH})},
YEAR = {2003},
ABSTRACT = {This paper describes LVCSR research into the automatic transcription
of spontaneous Czech speech in the MALACH (Multilingual Access to
Large Spoken Archives) project. This project attempts to provide
improved access to the large multilingual spoken archives collected
by the Survivors of the Shoah Visual History Foundation (VHF) (www.vhf.org)
by advancing the state of the art in automated speech recognition.
We describe a baseline ASR system and discuss the problems in language
modeling that arise from the nature of Czech as a highly inflectional
language that also exhibits diglossia between its written and spontaneous
forms. The difficulties of this task are compounded by heavily accented,
emotional and disfluent speech along with frequent switching between
languages. To overcome the limited amount of relevant language model
data we use statistical techniques for selecting an appropriate
training corpus from a large unstructured text collection resulting
in significant reductions in word error rate.},
PDF = {pubs/malachczeurosp03.pdf}
}
@inproceedings{BR_05_Exploit,
  author    = {B. Ramabhadran},
  title     = {Exploiting Large Quantities of Spontaneous Speech for
               Unsupervised Training of Acoustic Models},
  booktitle = {Proceedings of {INTERSPEECH 2005}, Eurospeech},
  year      = {2005},
  owner     = {dkw},
  timestamp = {2006.08.03}
}
@misc{BR_06_Transc_Next_Top_Model,
  author       = {B. Ramabhadran},
  title        = {{[PANEL]} Transcription's Next Top Model: Life Beyond Broadcast
                  News and Telephony Conversations},
  howpublished = {{IEEE-ASRU}},
  year         = {2005},
  owner        = {dkw},
  timestamp    = {2006.08.01}
}
@INPROCEEDINGS{ramabhadran03:_impac_audio_segmen_segmen_clust,
AUTHOR = {B. Ramabhadran and J. Huang and U. Chaudhari and G. Iyengar and H.J.
Nock},
TITLE = {Impact of Audio Segmentation and Segment Clustering on Automated
Transcription Accuracy of Large Spoken Archives},
BOOKTITLE = {Proceedings of {EUROSPEECH}},
YEAR = {2003},
ADDRESS = {Geneva},
MONTH = sep,
ABSTRACT = {This paper addresses the influence of audio segmentation and segment
clustering on automatic transcription accuracy for large spoken
archives. The work forms part of the ongoing MALACH project, which
is developing advanced techniques for supporting access to the world
of video oral histories collected in many languages from over 52,000
survivors and witnesses of the Holocaust. We present several audio-only
and audio-visual segmentation schemes, including two novel schemes:
the first is iterative and audio-only, the second uses audio-visual
synchrony. Unlike most previous work, we evaluate these schemes
in terms of their impact upon recognition accuracy. Results on English
interviews show the automatic segmentation schemes give performance
comparable to (exorbitantly expensive and impractically lengthy)
manual segmentation when using a single pass decoding strategy based
on speaker-independent models. However, when using a multiple pass
decoding strategy with adaptation, results are sensitive to both
initial audio segmentation and the scheme for clustering segments
prior to adaptation: the combination of our best automatic segmentation
and clustering scheme has an error rate 8\% worse (relative) to manual
audio segmentation and clustering due to the occurrence of ``speaker-impure''
segments.},
PDF = {pubs/euro03-seg.pdf}
}
@INPROCEEDINGS{ramabhadran_icassp03,
AUTHOR = {B. Ramabhadran and J. Huang and M. Picheny},
TITLE = {Towards Automatic Transcription of Large Spoken Archives -- {English}
{ASR} for the {MALACH} project},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2003},
ABSTRACT = {Digital archives have emerged as the pre-eminent method for capturing
the human experience. Before such archives can be used efficiently,
their content must be described. The NSF-funded MALACH project aims
to provide improved access to large spoken archives by advancing
the state-of-the-art in automatic speech recognition, information
retrieval, and related technologies for multiple languages. This
paper describes the ASR research for the English speech in the MALACH
corpus. The MALACH corpus consists of unconstrained, natural speech
filled with disfluencies, heavy accents, age-related coarticulation,
uncued speaker and language switching, and emotional speech collected
in the form of interviews from over 52,000 speakers in 32 languages.
In this paper, we describe this new testbed for developing speech
recognition algorithms and report on the performance of well-known
techniques for building better acoustic models for the speaking
styles seen in this corpus. The best English ASR system to date
has a word error rate of 43.8\% on this corpus.},
PDF = {pubs/icassp03-g001a.pdf}
}
@INPROCEEDINGS{BR_04_Use_Metadata,
AUTHOR = {B. Ramabhadran and O. Siohan and G. Zweig},
TITLE = {Use of Metadata to Improve Recognition of Spontaneous Speech and
Named Entities},
BOOKTITLE = {International Conference on Spoken Language Processing},
YEAR = {2004},
OWNER = {dkw},
TIMESTAMP = {2006.07.17}
}
@TECHREPORT{HGR_05_Errors_ASR,
AUTHOR = {H. Gladfelter Rubin and D. Soergel},
TITLE = {A Typology of Errors in {ASR} Transcriptions of Oral History Interviews},
INSTITUTION = {University of Maryland, College of Information Studies},
YEAR = {2005},
TYPE = {{MALACH} Technical Report},
ADDRESS = {College Park},
MONTH = aug,
OWNER = {dkw},
PDF = {pubs/HGR_05_Errors_ASR.pdf},
TIMESTAMP = {2006.08.02}
}
@inproceedings{AS_04_Meas_Conv,
  author    = {A. Sethy and S. Narayanan and B. Ramabhadran},
  title     = {Measuring Convergence in Language Model Estimation Using
               Relative Entropy},
  booktitle = {International Conference on Spoken Language Processing},
  year      = {2004},
  abstract  = {Language models are generally estimated using smoothed counting
               techniques. These counting schemes can be viewed as non linear
               functions operating on a Bernoulli process which converge
               asymptotically to the true density. The rate at which these
               counting schemes converge to the true density is constrained by
               the training data set available and the nature of the language
               model (LM) being estimated. In this paper we look at language
               model estimates as random variables and present an efficient
               relative entropy (R.E) based approach to study their convergence
               with increasing training data size. We present experimental
               results for language modeling in a generic LVCSR system and a
               medical domain dialogue task. We also present an efficient
               recursive R.E computation method which can be used as a LM
               distance measure for a number of tasks including LM clustering.},
  owner     = {dkw},
  pdf       = {pubs/AS_04_Meas_Conv.pdf},
  timestamp = {2006.07.17}
}
@INPROCEEDINGS{sethy_asru03,
AUTHOR = {A. Sethy and B. Ramabhadran and S. Narayanan},
TITLE = {Improvements in {ASR} for the {MALACH} Project Using Syllable-Centric
Models},
BOOKTITLE = {Proceedings of the {IEEE} Automatic Speech Recognition and Understanding
Workshop},
YEAR = {2003},
ADDRESS = {St. Thomas},
MONTH = dec,
ABSTRACT = {LVCSR systems have traditionally used phones as the basic acoustic
unit for recognition. Syllable and other longer length units provide
an efficient means for modeling long-term temporal dependencies
in speech that are difficult to capture in a phone based recognition
framework. However, it is well known that longer duration units
suffer from training data sparsity problems since a large number
of units in the lexicon will have little or no acoustic training
data. Previous research has shown that syllable-based modeling provides
improvements over word internal systems, but performance has lagged
behind crossword context-dependent systems. In this paper, we describe
a syllable-centric approach to English LVCSR for the MALACH (Multilingual
Access to Large spoken ArCHives) project. The combined modeling
of syllables and context-dependent phones provides a 0.5\% absolute
improvement in recognition accuracy over the state-of-the-art cross
word system for the heavily accented and spontaneous speech seen
in oral history archives. More importantly, we report on the importance
of the improved recognition of names and concepts that is crucial
for subsequent search and retrieval.},
PDF = {pubs/asru03-syl.pdf}
}
@INPROCEEDINGS{IS_04_Task_Specific,
AUTHOR = {I. Shafran and W. Byrne},
TITLE = {Task-Specific Minimum {Bayes}-risk Decoding Using Learned Edit Distance},
BOOKTITLE = {Proceedings of the International Conference on Spoken Language Processing},
YEAR = {2004},
ABSTRACT = {This paper extends the minimum Bayes-risk framework to incorporate
a loss function specific to the task and the ASR system. The errors
are modeled as a noisy channel and the parameters are learned from
the data. The resulting loss function is used in the risk criterion
for decoding. Experiments on a large vocabulary conversational speech
recognition system demonstrate significant gains of about 1\% absolute
over MAP hypothesis and about 0.6\% absolute over untrained loss function.
The approach is general enough to be applicable to other sequence
recognition problems such as in Optical Character Recognition (OCR)
and in analysis of biological sequences.},
OWNER = {dkw},
PDF = {pubs/IS_04_Task-spec_Min_Bayes.pdf},
TIMESTAMP = {2006.07.15}
}
@INPROCEEDINGS{IS_06_Corr_Models,
AUTHOR = {I. Shafran and K. Hall},
TITLE = {Corrective Models for Speech Recognition of Inflected Languages},
BOOKTITLE = {Proceedings of the Conference on Empirical Methods in Natural Language
Processing ({EMNLP})},
YEAR = {2006},
ADDRESS = {Sydney, Australia},
MONTH = jul,
ABSTRACT = {This paper presents a corrective model for speech recognition of inflected
languages. The model, based on a discriminative framework, incorporates
word ngrams features as well as factored morphological features,
providing error reduction over the model based solely on word n-gram
features. Experiments on a large vocabulary task, namely the Czech
portion of the MALACH corpus, demonstrate performance gain of about
1.1--1.5\% absolute in word error rate, wherein morphological features
contribute about a third of the improvement. A simple feature selection
mechanism based on $\chi^2$ statistics is shown to be effective in reducing
the number of features by about 70\% without any loss in performance,
making it feasible to explore yet larger feature spaces.},
OWNER = {dkw},
PDF = {pubs/IS_06_Corr_Models.pdf},
TIMESTAMP = {2006.07.15}
}
@INPROCEEDINGS{OS_05_Fast_Vocab,
AUTHOR = {O. Siohan and M. Bacchiani},
TITLE = {Fast Vocabulary-Independent Audio Search Using Path-Based Graph Indexing},
BOOKTITLE = {Proceedings of {INTERSPEECH 2005}, Eurospeech},
YEAR = {2005},
OWNER = {dkw},
TIMESTAMP = {2006.07.17}
}
@INPROCEEDINGS{OS_05_Constuct_ensembles,
AUTHOR = {O. Siohan and B. Ramabhadran and B. Kingsbury},
TITLE = {Constructing Ensembles of {ASR} Systems Using Randomized Decision
Trees},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2005},
ABSTRACT = {Building multiple automatic speech recognition (ASR) systems and combining
their outputs using voting techniques such as ROVER is an effective
technique for lowering the overall word error rate. A successful
system combination approach requires the construction of multiple
systems with complementary errors, or the combination will not outperform
any of the individual systems. In general, this is achieved empirically,
for example by building systems on different input features. In
this paper, we present a systematic approach for building multiple
ASR systems in which the decision tree state-tying procedure that
is used to specify context-dependent acoustic models is randomized.
Experiments carried out on two large vocabulary recognition tasks,
MALACH and DARPA EARS, illustrate the effectiveness of the approach.},
OWNER = {dkw},
PDF = {pubs/OS_05_Constuct_Ensembles.pdf},
TIMESTAMP = {2006.07.17}
}
@INPROCEEDINGS{OS_04_Speech_Recog,
AUTHOR = {O. Siohan and B. Ramabhadran and G. Zweig},
TITLE = {Speech Recognition Error Analysis on the {English} {MALACH} Corpus},
BOOKTITLE = {International Conference on Spoken Language Processing},
YEAR = {2004},
OWNER = {dkw},
TIMESTAMP = {2006.07.17}
}
@MISC{DS_06_Das_Malach_Projekt,
AUTHOR = {D. Soergel},
TITLE = {Das {MALACH} {Projekt}. {Zugang} zu mündlich überlieferter {Geschichte} durch
verbesserte {Methoden} im {Sprachretrieval}. {Spracherkennung}, {Sprachverarbeitung},
{Retrievalmethoden}, {Benutzerschnittstelle}},
HOWPUBLISHED = {Presentation, University of Saarbruecken},
MONTH = feb,
YEAR = {2005},
OWNER = {dkw},
TIMESTAMP = {2006.08.01}
}
@TECHREPORT{malach_cistr_2002,
AUTHOR = {D. Soergel and D.W. Oard and S. Gustman and L. Fraser and J. Kim
and J. Meyer and E. Proffen and T. Sartori},
TITLE = {The Many Uses of Digitized Oral History Collections: Implications
for Design},
INSTITUTION = {College of Information Studies, University of Maryland},
YEAR = {2002},
ABSTRACT = {Oral history - and spoken word collections generally - are assuming
increasing importance in digital libraries as the storage, transmission
and reproduction infrastructure improves. This paper describes three
synergistic approaches to user needs analysis, explains how they
are being applied to guide the design of systems to provide access
to oral history collections (using as a test bed the Shoah Foundation's
collection of over 50,000 videotaped oral history interviews), presents
preliminary results from so-called ``discount requirements analysis''
of a wide variety of users and uses of oral history data and a concomitant
variety of access points that would be useful.},
PDF = {pubs/MALACHTechReportUses.pdf}
}
@INPROCEEDINGS{ST_05_Acoustic_Trng,
AUTHOR = {S. Tsakalidis and W. Byrne},
TITLE = {Acoustic Training from Heterogeneous Data Sources: Experiments in
{Mandarin} Conversational Telephone Speech Transcription},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2005},
OWNER = {dkw},
PDF = {pubs/ST_05_Acoust_Trng.pdf},
TIMESTAMP = {2006.07.15}
}
@INPROCEEDINGS{VV_05_Lat_Seg,
AUTHOR = {V. Venkataramani and W. Byrne},
TITLE = {Lattice Segmentation and Support Vector Machines for Large Vocabulary
Continuous Speech Recognition},
BOOKTITLE = {Proceedings of the {IEEE} International Conference on Acoustics,
Speech, and Signal Processing ({ICASSP})},
YEAR = {2005},
ADDRESS = {Philadelphia},
ABSTRACT = {Lattice segmentation procedures are used to spot possible recognition
errors in first-pass recognition hypotheses produced by a large
vocabulary continuous speech recognition system. This approach is
analyzed in terms of its ability to reliably identify, and provide
good alternatives for, incorrectly hypothesized words. A procedure
is described to train and apply Support Vector Machines to strengthen
the first pass system where it was found to be weak, resulting in
small but statistically significant recognition improvements on
a large test set of conversational speech.},
OWNER = {dkw},
PDF = {pubs/VV_05_Lat_Seg.pdf},
TIMESTAMP = {2006.07.15}
}
@ARTICLE{VV_06_Gini_support,
AUTHOR = {V. Venkataramani and S. Chakrabartty and W. Byrne},
TITLE = {{Gini} Support Vector Machines for Segmental Minimum {Bayes} Risk Decoding
of Continuous Speech},
JOURNAL = {Computer Speech and Language},
YEAR = {2006},
NOTE = {Accepted; in revision},
ABSTRACT = {We describe the use of Support Vector Machines (SVMs) for continuous
speech recognition by incorporating them in Segmental Minimum Bayes
Risk decoding. Lattice cutting is used to convert the Automatic
Speech Recognition search space into sequences of smaller recognition
problems. SVMs are then trained as discriminative models over each
of these problems and used in a rescoring framework. We pose the
estimation of a posterior distribution over hypothesis in these
regions of acoustic confusion as a logistic regression problem.
We also show that GiniSVMs can be used as an approximation technique
to estimate the parameters of the logistic regression problem. On
a small vocabulary recognition task we show that the use of GiniSVMs
can improve the performance of a well trained Hidden Markov Model
system trained under the Maximum Mutual Information criterion. We
also find that it is possible to derive reliable confidence scores
over the GiniSVM hypotheses and that these can be used to good effect
in hypothesis combination. We discuss the problems that we expect
to encounter in extending this approach to Large Vocabulary Continuous
Speech Recognition and describe initial investigation of constrained
estimation techniques to derive feature spaces for SVMs.},
OWNER = {dkw},
PDF = {pubs/VV_06_Gini_Support.pdf},
TIMESTAMP = {2006.07.15}
}
@INPROCEEDINGS{asru03_svmsmbr,
AUTHOR = {V. Venkataramani and S. Chakrabartty and W. Byrne},
TITLE = {Support Vector Machines for Segmental Minimum {Bayes} Risk Decoding
of Continuous Speech},
BOOKTITLE = {{IEEE} Automatic Speech Recognition and Understanding Workshop},
YEAR = {2003},
ABSTRACT = {Segmental Minimum Bayes Risk (SMBR) Decoding involves the refinement
of the search space into manageable confusion sets, i.e.,
smaller sets of confusable words. We describe the application of
Support Vector Machines (SVMs) as discriminative models for the
refined search space. We show that SVMs, which in their basic formulation
are binary classifiers of fixed dimensional observations, can be
used for continuous speech recognition. We also study the use of
GiniSVMs, which is a variant of the basic SVM. On a small vocabulary
task, we show this two pass scheme outperforms MMI trained HMMs.
Using system combination we also obtain further improvements over
discriminatively trained HMMs.},
PDF = {pubs/asru03_smbr_svm.pdf}
}
% CLEF-2005 working-notes paper (CL-SR track, University of Maryland).
% Initials are space-separated so abbreviating styles keep every initial.
% The percent sign in the abstract is escaped: BibTeX copies it verbatim and
% LaTeX would otherwise treat the rest of the output line as a comment.
@INPROCEEDINGS{JW_05_CLEF_2005,
AUTHOR = {J. Wang and D. W. Oard},
TITLE = {{CLEF} 2005 {CL-SR} at {Maryland}: Document and Query Expansion Using
Side Collections and Thesauri},
BOOKTITLE = {Working Notes for the {CLEF-2005} Workshop},
YEAR = {2005},
ADDRESS = {Vienna, Austria},
ABSTRACT = {This paper reports results for the University of Maryland's participation
in CLEF-2005 Cross-Language Speech Retrieval track. Techniques that
were tried include: (1) document expansion with manually created
metadata (thesaurus keywords and segment summaries) from a large
side collection, (2) query refinement with pseudo-relevance feedback,
(3) keyword expansion with thesaurus synonyms, and (4) cross-language
speech retrieval using translation knowledge obtained from the statistics
of a large parallel corpus. The results show that document expansion
and query expansion using blind relevance feedback were effective,
although optimal parameter choices differed somewhat between the
training and evaluation sets. Document expansion in which manually
assigned keywords were augmented with thesaurus synonyms yielded
marginal gains on the training set, but no improvement on the evaluation
set. Cross-language retrieval with French queries yielded 79\% of
monolingual mean average precision when searching manually assigned
metadata despite a substantial domain mis-match between the parallel
corpus and the retrieval task. Detailed failure analysis indicates
that speech recognition errors for named entities were an important
factor that substantially degraded retrieval effectiveness.},
KEYWORDS = {Speech Retrieval, Document Expansion, Query Expansion, Blind Relevance
Feedback},
OWNER = {dkw},
PDF = {pubs/JW_05_Query_Expan.pdf},
TIMESTAMP = {2006.08.01}
}
% SIGIR 2005 proceedings entry. No PAGES, ABSTRACT or PDF are recorded here.
% Initials are space-separated: BibTeX treats a glued pair as one name token
% and abbreviating styles would then drop the second initial.
@INPROCEEDINGS{RW_05_chrono_metadata,
AUTHOR = {R. W. White},
TITLE = {Chronological Metadata},
BOOKTITLE = {Proceedings of the 28th Annual International ACM SIGIR Conference
on Research and Development in Information Retrieval},
YEAR = {2005},
OWNER = {dkw},
TIMESTAMP = {2006.07.17}
}
% Overview paper for the CLEF-2005 cross-language speech retrieval track.
% Initials are space-separated so abbreviating styles keep every initial.
% MONTH uses the predefined macro so each style can spell or abbreviate it.
@INPROCEEDINGS{RW_05_CLEF_overview,
AUTHOR = {R. W. White and D. W. Oard and G. J. F. Jones and D. Soergel and X. Huang},
TITLE = {Overview of the {CLEF}-2005 Cross-Language Speech Retrieval Track},
BOOKTITLE = {Cross-Language Evaluation Forum},
YEAR = {2005},
ADDRESS = {Vienna, Austria},
MONTH = sep,
ABSTRACT = {The task for the CLEF-2005 cross-language speech retrieval track was
to identify topically coherent segments of English interviews in
a known-boundary condition. Seven teams participated, performing
both monolingual and cross-language searches of ASR transcripts,
automatically generated metadata, and manually generated metadata.
Results indicate that monolingual search technology is sufficiently
accurate to be useful for some purposes (the best mean average precision
was 0.18) and cross-language searching yielded results typical of
those seen in other applications (with the best systems approximating
monolingual mean average precision).},
OWNER = {dkw},
PDF = {pubs/RW_05_CLEF-Overview.pdf},
TIMESTAMP = {2006.07.17}
}
% JCDL 2006 short paper on concept maps for oral history search.
% PAGES uses a double hyphen so the range is typeset as an en dash.
% DOI holds the bare identifier; styles and tools prepend the resolver URL.
@INPROCEEDINGS{RW_06_ConceptMaps,
AUTHOR = {R. W. White and H. Song and J. Liu},
TITLE = {Concept Maps to Support Oral History Search and Use},
BOOKTITLE = {{JCDL} '06: Proceedings of the 6th ACM/IEEE-CS joint conference on
Digital Libraries},
YEAR = {2006},
PAGES = {192--193},
ADDRESS = {Chapel Hill, NC, USA},
PUBLISHER = {ACM Press, New York, NY, USA},
ABSTRACT = {In this paper we describe a novel technique to support information
seeking in oral history archives using concept maps. We conducted
a pilot study with teachers engaged in work tasks using a prototype
concept mapping tool. Results suggest that concept maps can help
searchers, especially when tasks are complex.},
DOI = {10.1145/1141753.1141794},
OWNER = {dkw},
PDF = {pubs/RW_06_ConceptMaps.pdf},
TIMESTAMP = {2006.08.01}
}
% MALACH technical report (University of Maryland, May 2006).
% MONTH uses the predefined macro so each style can spell or abbreviate it.
% No report NUMBER is recorded for this entry.
@TECHREPORT{PZ_06_Knowl-Based,
AUTHOR = {P. Zhang and D. Soergel},
TITLE = {Knowledge-Based Approaches to the Segmentation of Oral History Interviews},
INSTITUTION = {University of Maryland, College of Information Studies},
YEAR = {2006},
TYPE = {{MALACH} Technical Report},
ADDRESS = {College Park},
MONTH = may,
ABSTRACT = {This paper applies discourse knowledge to the segmentation of speech
transcripts. The paper reviews literature on discourse structure,
as well as approaches used in text segmentation and speech segmentation,
identifies what features are used and how the features are combined
in these approaches. After reviewing the literature, a three-part
study is conducted to answer the following three research questions:
Are discourse-markers indicators of segment boundaries in oral history
interviews?
Are questions good indicators of segment boundaries? Could questions
be used as segment boundary or segment continuation indicators?
Do the discourse structures proposed by Labov and Waletzky (1967,
1997) and Stein and Glenn (1979) hold for oral history interviews?
How could this knowledge be used in automatic segmentation?
Methodology, results and analysis of each part of the study are described.
Major findings include trends in segmentation and answers to these
questions. Limitation of the study is discussed. The paper also
suggests future research topic relates to segmentation and discourse
analysis.},
OWNER = {dkw},
PDF = {pubs/PZ_06_Knowl-Based.pdf},
TIMESTAMP = {2006.08.02}
}
% CLSP research note (Johns Hopkins University, August 2006).
% The proper noun and acronyms in the title are braced so sentence-casing
% styles do not lowercase them.
% MONTH uses the predefined macro so each style can spell or abbreviate it.
% No ABSTRACT or report NUMBER is recorded for this entry.
@TECHREPORT{IS_06_Acoustic,
AUTHOR = {I. Shafran},
TITLE = {Acoustic and Language Modeling for {Czech} {ASR} in {MALACH}},
INSTITUTION = {The Johns Hopkins University, The Center for Language and Speech Processing},
YEAR = {2006},
TYPE = {{CLSP} Research Note},
ADDRESS = {Baltimore, MD},
MONTH = aug,
OWNER = {katyn},
PDF = {pubs/IS_06_Acoustic.pdf},
TIMESTAMP = {2006.09.20}
}
This file has been generated by bibtex2html 1.77