SeqAn3 3.3.0-rc.1
The Modern C++ library for sequence analysis.
structure_file/input.hpp
Go to the documentation of this file.
1// -----------------------------------------------------------------------------------------------------
2// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6// -----------------------------------------------------------------------------------------------------
7
13#pragma once
14
15#include <cassert>
16#include <filesystem>
17#include <fstream>
18#include <limits>
19#include <optional>
20#include <string>
21#include <type_traits>
22#include <utility>
23#include <variant>
24#include <vector>
25
43
44namespace seqan3
45{
46// ----------------------------------------------------------------------------
47// structure_file_input_traits
48// ----------------------------------------------------------------------------
49
141template <typename t>
143 requires (t v) {
144 // TODO(joergi-w) The expensive concept checks are currently omitted. Check again when compiler has improved.
145 // sequence
150
151 // id
154
155 // bpp
156 requires std::is_floating_point_v<typename t::bpp_prob>;
158
159 // requires container // TODO check Associative container Concept when implemented
160 // <typename t::template bpp_queue
161 // <typename t::template bpp_item
162 // <typename t::bpp_prob, typename t::bpp_partner>>>
163 // && requires(typename t::template bpp_queue // TODO maybe implement also a version that allows emplace_back
164 // <typename t::template bpp_item
165 // <typename t::bpp_prob, typename t::bpp_partner>> value) { value.emplace(1.0, 1); };
166 // requires sequence_container
167 // <typename t::template bpp_container
168 // <typename t::template bpp_queue
169 // <typename t::template bpp_item
170 // <typename t::bpp_prob, typename t::bpp_partner>>>>;
171
172 // structure
173 requires std::is_same_v<typename t::structure_alphabet, dssp9> // TODO(joergi-w) add aa_structure_concept
176
177 // structured sequence: tuple composites of seq and structure
178 requires std::is_base_of_v<
179 alphabet_tuple_base<
180 typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>,
181 typename t::seq_alphabet,
182 typename t::structure_alphabet>,
183 typename t::template structured_seq_alphabet<typename t::seq_alphabet, typename t::structure_alphabet>>;
184 // requires sequence_container
185 // <typename t::template structured_seq_container
186 // <typename t::template structured_seq_alphabet
187 // <typename t::seq_alphabet, typename t::structure_alphabet>>>;
188
189 // energy: std::optional of floating point number
190 requires std::is_floating_point_v<typename t::energy_type::value_type>;
191
192 // reactivity [error]
193 requires std::is_floating_point_v<typename t::react_type>;
195
196 // comment
199
200 // offset
202 };
204
205// ----------------------------------------------------------------------------
206// structure_file_input_default_traits
207// ----------------------------------------------------------------------------
208
223{
229 // sequence
230
233
236
238 template <typename _seq_alphabet>
240
241 // id
242
244 using id_alphabet = char;
245
247 template <typename _id_alphabet>
249
250 // base pair probability structure
251
253 using bpp_prob = double;
254
256 using bpp_partner = size_t;
257
259 template <typename _bpp_prob, typename _bpp_partner>
261
263 template <typename _bpp_item>
265
267 template <typename _bpp_queue>
269
270 // fixed structure
271
273 using structure_alphabet = wuss51;
274
276 template <typename _structure_alphabet>
278
279 // combined sequence and structure
280
282 template <typename _seq_alphabet, typename _structure_alphabet>
284
286 template <typename _structured_seq_alphabet>
288
289 // energy
290
293
294 // reactivity [error]
295
297 using react_type = double;
298
300 template <typename _react_type>
302
303 // comment
304
306 using comment_alphabet = char;
307
309 template <typename _comment_alphabet>
311
312 // offset
313
315 using offset_type = size_t;
317};
318
322{
335 template <typename _seq_alphabet, typename _structure_alphabet>
338};
339
340// ----------------------------------------------------------------------------
341// structure_file_input
342// ----------------------------------------------------------------------------
343
362{
363public:
369 using traits_type = traits_type_;
371 using selected_field_ids = selected_field_ids_;
373 using valid_formats = valid_formats_;
375 using stream_char_type = char;
377
382 field::id,
391
392 static_assert(
393 []() constexpr {
394 for (field f : selected_field_ids::as_array)
395 if (!field_ids::contains(f))
396 return false;
397 return true;
398 }(),
399 "You selected a field that is not valid for structure files, please refer to the documentation "
400 "of structure_file_input::field_ids for the accepted values.");
401
402 static_assert(
403 []() constexpr {
406 }(),
407 "You may not select field::structured_seq and either of field::seq and field::structure "
408 "at the same time.");
409
416 using seq_type = typename traits_type::template seq_container<typename traits_type::seq_alphabet>;
418 using id_type = typename traits_type::template id_container<typename traits_type::id_alphabet>;
420 using bpp_type = typename traits_type::template bpp_container<typename traits_type::template bpp_queue<
421 typename traits_type::template bpp_item<typename traits_type::bpp_prob, typename traits_type::bpp_partner>>>;
423 using structure_type = typename traits_type::template structure_container<typename traits_type::structure_alphabet>;
425 using structured_seq_type = typename traits_type::template structured_seq_container<
426 typename traits_type::template structured_seq_alphabet<typename traits_type::seq_alphabet,
427 typename traits_type::structure_alphabet>>;
429 using energy_type = typename traits_type::energy_type;
431 using react_type = typename traits_type::template react_container<typename traits_type::react_type>;
433 using comment_type = typename traits_type::template comment_container<typename traits_type::comment_alphabet>;
435 using offset_type = typename traits_type::offset_type;
436
439 id_type,
440 bpp_type,
448
453
463 using const_reference = void;
465 using size_type = size_t;
471 using const_iterator = void;
473 using sentinel = std::default_sentinel_t;
475
491
509 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
511 {
512 primary_stream->rdbuf()->pubsetbuf(stream_buffer.data(), stream_buffer.size());
514 ->open(filename, std::ios_base::in | std::ios::binary);
515
516 if (!primary_stream->good())
517 throw file_open_error{"Could not open file " + filename.string() + " for reading."};
518
519 // possibly add intermediate decompression stream
521
522 // initialise format handler
523 detail::set_format(format, filename);
524 }
525
541 template <input_stream stream_t, structure_file_input_format file_format>
542 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
543 structure_file_input(stream_t & stream,
544 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
545 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
547 format{detail::structure_file_input_format_exposer<file_format>{}}
548 {
549 static_assert(list_traits::contains<file_format, valid_formats>,
550 "You selected a format that is not in the valid_formats of this file.");
551
552 // possibly add intermediate decompression stream
554 }
555
557 template <input_stream stream_t, structure_file_input_format file_format>
558 requires std::same_as<typename std::remove_reference_t<stream_t>::char_type, char>
559 structure_file_input(stream_t && stream,
560 file_format const & SEQAN3_DOXYGEN_ONLY(format_tag),
561 selected_field_ids const & SEQAN3_DOXYGEN_ONLY(fields_tag) = selected_field_ids{}) :
562 primary_stream{new stream_t{std::move(stream)}, stream_deleter_default},
563 format{detail::structure_file_input_format_exposer<file_format>{}}
564 {
565 static_assert(list_traits::contains<file_format, valid_formats>,
566 "You selected a format that is not in the valid_formats of this file.");
567
568 // possibly add intermediate decompression stream
570 }
572
592 {
593 // buffer first record
595 {
598 }
599
600 return {*this};
601 }
602
// Returns a default-constructed sentinel (the `sentinel` alias is std::default_sentinel_t)
// that iterators obtained from begin() are compared against.
616 sentinel end() noexcept
617 {
618 return {};
619 }
620
// Returns the record the file is currently positioned at by dereferencing begin().
// NOTE(review): declared noexcept although it delegates to begin(), which is not marked
// noexcept in this listing and buffers the first record — confirm begin() cannot throw
// here, otherwise an exception would end in std::terminate.
644 reference front() noexcept
645 {
646 return *begin();
647 }
649
651 structure_file_input_options<typename traits_type::seq_legal_alphabet,
654
655protected:
657
667
676 {}
679 {
680 delete ptr;
681 }
682
687
691 bool at_end{false};
692
699
702 {
703 // clear the record
705
706 // store the current position in the position buffer
708
709 // at end if we could not read further
712 {
713 at_end = true;
714 return;
715 }
716
717 assert(!format.valueless_by_exception());
719 [&](auto & f)
720 {
721 // read new record
723 {
725 "You may not select field::structured_seq and field::structure at the same time.");
727 "You may not select field::structured_seq and field::seq at the same time.");
728 f.read_structure_record(*secondary_stream,
729 options,
730 detail::get_or_ignore<field::structured_seq>(record_buffer), // seq
731 detail::get_or_ignore<field::id>(record_buffer),
732 detail::get_or_ignore<field::bpp>(record_buffer),
733 detail::get_or_ignore<field::structured_seq>(record_buffer), // structure
734 detail::get_or_ignore<field::energy>(record_buffer),
735 detail::get_or_ignore<field::react>(record_buffer),
736 detail::get_or_ignore<field::react_err>(record_buffer),
737 detail::get_or_ignore<field::comment>(record_buffer),
738 detail::get_or_ignore<field::offset>(record_buffer));
739 }
740 else
741 {
742 f.read_structure_record(*secondary_stream,
743 options,
744 detail::get_or_ignore<field::seq>(record_buffer),
745 detail::get_or_ignore<field::id>(record_buffer),
746 detail::get_or_ignore<field::bpp>(record_buffer),
747 detail::get_or_ignore<field::structure>(record_buffer),
748 detail::get_or_ignore<field::energy>(record_buffer),
749 detail::get_or_ignore<field::react>(record_buffer),
750 detail::get_or_ignore<field::react_err>(record_buffer),
751 detail::get_or_ignore<field::comment>(record_buffer),
752 detail::get_or_ignore<field::offset>(record_buffer));
753 }
754 },
755 format);
756 }
757
759 friend iterator;
760};
761
768template <input_stream stream_type,
769 structure_file_input_format file_format,
770 detail::fields_specialisation selected_field_ids>
771structure_file_input(stream_type && stream, file_format const &, selected_field_ids const &)
775
777template <input_stream stream_type,
778 structure_file_input_format file_format,
780structure_file_input(stream_type & stream, file_format const &, selected_field_ids const &)
785
786} // namespace seqan3
Provides seqan3::aa27, container aliases and string literals.
Provides alphabet adaptations for standard char types.
The twenty-seven letter amino acid alphabet.
Definition: aa27.hpp:46
Input iterator necessary for providing a range-like interface in input file.
Definition: in_file_iterator.hpp:41
The protein structure alphabet of the characters "HGIEBTSCX".
Definition: dssp9.hpp:63
The 15 letter RNA alphabet, containing all IUPAC symbols minus the gap.
Definition: rna15.hpp:51
The five letter RNA alphabet of A,C,G,U and the unknown character N.
Definition: rna5.hpp:49
The generic concept for structure file in formats.
Definition: structure_file/input_format_concept.hpp:142
A class for reading structured sequence files, e.g. Stockholm, Connect, Vienna, ViennaRNA bpp matrix ...
Definition: structure_file/input.hpp:362
std::vector< char > stream_buffer
A larger (compared to stl default) stream buffer to use when reading from a file.
Definition: structure_file/input.hpp:663
format_type format
The actual std::variant holding a pointer to the detected/selected format.
Definition: structure_file/input.hpp:697
structure_file_input_options< typename traits_type::seq_legal_alphabet, selected_field_ids::contains(field::structured_seq)> options
The options are public and its members can be set directly.
Definition: structure_file/input.hpp:653
reference front() noexcept
Return the record we are currently at in the file.
Definition: structure_file/input.hpp:644
structure_file_input(structure_file_input const &)=delete
Copy construction is explicitly deleted, because you cannot have multiple access to the same file.
typename traits_type::template id_container< typename traits_type::id_alphabet > id_type
The type of the ID field (default std::string).
Definition: structure_file/input.hpp:418
size_t size_type
An unsigned integer type, usually std::size_t.
Definition: structure_file/input.hpp:465
typename traits_type::template seq_container< typename traits_type::seq_alphabet > seq_type
The type of the sequence field (default std::vector of seqan3::rna5).
Definition: structure_file/input.hpp:416
typename traits_type::template comment_container< typename traits_type::comment_alphabet > comment_type
The type of the comment field (a container over traits_type::comment_alphabet, which defaults to char).
Definition: structure_file/input.hpp:433
std::default_sentinel_t sentinel
The type returned by end().
Definition: structure_file/input.hpp:473
structure_file_input & operator=(structure_file_input const &)=delete
Copy assignment is explicitly deleted, because you cannot have multiple access to the same file.
structure_file_input(stream_t &stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from an existing stream and with specified format.
Definition: structure_file/input.hpp:543
structure_record< detail::select_types_with_ids_t< field_types, field_ids, selected_field_ids >, selected_field_ids > record_type
The type of the record, a specialisation of seqan3::record; acts as a tuple of the selected field typ...
Definition: structure_file/input.hpp:451
structure_file_input(stream_type &&stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
Deduction of the selected fields, the file format and the stream type.
structure_file_input(stream_t &&stream, file_format const &format_tag, selected_field_ids const &fields_tag=selected_field_ids{})
This is an overloaded member function, provided for convenience. It differs from the above function o...
Definition: structure_file/input.hpp:559
char stream_char_type
Character type of the stream(s).
Definition: structure_file/input.hpp:375
void const_reference
The const_reference type is void, because files are not const-iterable.
Definition: structure_file/input.hpp:463
structure_file_input()=delete
Default constructor is explicitly deleted, you need to give a stream or file name.
traits_type_ traits_type
A traits type that defines aliases and template for storage of the fields.
Definition: structure_file/input.hpp:369
typename detail::variant_from_tags< valid_formats, detail::structure_file_input_format_exposer >::type format_type
Type of the format, a std::variant over the valid_formats.
Definition: structure_file/input.hpp:695
bool at_end
File is at position 1 behind the last record.
Definition: structure_file/input.hpp:691
void const_iterator
The const iterator type is void, because files are not const-iterable.
Definition: structure_file/input.hpp:471
selected_field_ids_ selected_field_ids
A seqan3::fields list with the fields selected for the record.
Definition: structure_file/input.hpp:371
structure_file_input(structure_file_input &&)=default
Move construction is defaulted.
structure_file_input(stream_type &stream, file_format const &, selected_field_ids const &) -> structure_file_input< typename structure_file_input<>::traits_type, selected_field_ids, type_list< file_format > >
This is an overloaded member function, provided for convenience. It differs from the above function o...
static void stream_deleter_default(std::basic_istream< stream_char_type > *ptr)
Stream deleter with default behaviour (ownership assumed).
Definition: structure_file/input.hpp:678
typename traits_type::template structure_container< typename traits_type::structure_alphabet > structure_type
The type of the structure field (default std::vector of seqan3::wuss51).
Definition: structure_file/input.hpp:423
friend iterator
Befriend iterator so it can access the buffers.
Definition: structure_file/input.hpp:759
sentinel end() noexcept
Returns a sentinel for comparison with iterator.
Definition: structure_file/input.hpp:616
structure_file_input & operator=(structure_file_input &&)=default
Move assignment is defaulted.
iterator begin()
Returns an iterator to current position in the file.
Definition: structure_file/input.hpp:591
stream_ptr_t primary_stream
The primary stream is the user provided stream or the file stream if constructed from filename.
Definition: structure_file/input.hpp:684
std::streampos position_buffer
Buffer for the previous record position.
Definition: structure_file/input.hpp:665
valid_formats_ valid_formats
A seqan3::type_list with the possible formats.
Definition: structure_file/input.hpp:373
typename traits_type::energy_type energy_type
The type of the energy field (a std::optional of a floating point number, as required by the traits concept).
Definition: structure_file/input.hpp:429
record_type record_buffer
Buffer for a single record.
Definition: structure_file/input.hpp:661
void read_next_record()
Tell the format to move to the next record and update the buffer.
Definition: structure_file/input.hpp:701
typename traits_type::offset_type offset_type
The type of the offset field (default size_t).
Definition: structure_file/input.hpp:435
typename traits_type::template structured_seq_container< typename traits_type::template structured_seq_alphabet< typename traits_type::seq_alphabet, typename traits_type::structure_alphabet > > structured_seq_type
The type of the sequence-structure field (default std::vector of structured_rna<rna5,...
Definition: structure_file/input.hpp:427
typename traits_type::template bpp_container< typename traits_type::template bpp_queue< typename traits_type::template bpp_item< typename traits_type::bpp_prob, typename traits_type::bpp_partner > > > bpp_type
The type of the base pair probabilities (default std::vector of std::set<std::pair<double,...
Definition: structure_file/input.hpp:421
static void stream_deleter_noop(std::basic_istream< stream_char_type > *)
Stream deleter that does nothing (no ownership assumed).
Definition: structure_file/input.hpp:675
structure_file_input(std::filesystem::path filename, selected_field_ids const &fields_tag=selected_field_ids{})
Construct from filename.
Definition: structure_file/input.hpp:508
~structure_file_input()=default
Destructor is defaulted.
typename traits_type::template react_container< typename traits_type::react_type > react_type
The type of the reactivity and reactivity error fields (a container over traits_type::react_type, which defaults to double).
Definition: structure_file/input.hpp:431
bool first_record_was_read
Tracks whether the very first record is buffered when calling begin().
Definition: structure_file/input.hpp:689
stream_ptr_t secondary_stream
The secondary stream is a compression layer on the primary or just points to the primary (no compress...
Definition: structure_file/input.hpp:686
A seqan3::alphabet_tuple_base that joins an aminoacid alphabet with a protein structure alphabet....
Definition: structured_aa.hpp:55
A seqan3::alphabet_tuple_base that joins a nucleotide alphabet with an RNA structure alphabet....
Definition: structured_rna.hpp:56
Auxiliary concept that checks whether a type is a specialisation of seqan3::fields.
Definition: detail/record.hpp:35
Auxiliary concept that checks whether a type is a seqan3::type_list and all types meet seqan3::struct...
Definition: structure_file/input_format_concept.hpp:234
T data(T... args)
Provides auxiliary data structures and functions for seqan3::record and seqan3::fields.
Provides the dssp format for protein structure.
Provides the seqan3::format_vienna.
T get(T... args)
field
An enumerator for the fields used in file formats.
Definition: record.hpp:63
void set_format(format_variant_type &format, std::filesystem::path const &file_name)
Sets the file format according to the file name extension.
Definition: io/detail/misc.hpp:68
auto make_secondary_istream(std::basic_istream< char_t > &primary_stream, std::filesystem::path &filename) -> std::unique_ptr< std::basic_istream< char_t >, std::function< void(std::basic_istream< char_t > *)> >
Depending on the magic bytes of the given stream, return a decompression stream or forward the primar...
Definition: misc_input.hpp:80
@ energy
Energy of a folded sequence, represented by one float number.
@ comment
Comment field of arbitrary content, usually a string.
@ structure
Fixed interactions, usually a string of structure alphabet characters.
@ bpp
Base pair probability matrix of interactions, usually a matrix of float numbers.
@ react
Reactivity values of the sequence characters given in a vector of float numbers.
@ react_err
Reactivity error values given in a vector corresponding to seqan3::field::react.
@ offset
Sequence (seqan3::field::seq) relative start position (0-based), unsigned value.
@ structured_seq
Sequence and fixed interactions combined in one range.
@ id
The identifier, usually a string.
@ seq
The "sequence", usually a range of nucleotides or amino acids.
constexpr bool contains
Whether a type occurs in a type list or not.
Definition: type_list/traits.hpp:252
Provides the seqan3::detail::in_file_iterator class template.
Checks whether from can be explicitly converted to to.
A concept that indicates whether an alphabet represents RNA structure.
A more refined container concept than seqan3::container.
The requirements a traits_type for seqan3::structure_file_input must meet.
Refines seqan3::alphabet and adds assignability.
Provides exceptions used in the I/O module.
Stream concepts.
T is_base_of_v
Provides various utility functions required only for input.
The main SeqAn3 namespace.
Definition: aligned_sequence_concept.hpp:29
Provides seqan3::rna15, container aliases and string literals.
Provides seqan3::rna5, container aliases and string literals.
T size(T... args)
Base class to deduce the std::variant type from format tags.
Definition: io/detail/misc.hpp:31
A class template that holds a choice of seqan3::field.
Definition: record.hpp:128
static constexpr bool contains(field f)
Whether a field is contained in the parameter pack.
Definition: record.hpp:149
void clear() noexcept(noexcept(std::apply(expander, std::declval< record & >())))
Clears containers that provide .clear() and (re-)initialises all other elements with = {}.
Definition: record.hpp:237
A traits type that specifies input as amino acids.
Definition: structure_file/input.hpp:322
The default traits for seqan3::structure_file_input.
Definition: structure_file/input.hpp:223
wuss51 structure_alphabet
The alphabet for a structure annotation is seqan3::wuss51.
Definition: structure_file/input.hpp:273
size_t offset_type
The type of the offset is size_t.
Definition: structure_file/input.hpp:315
double bpp_prob
The type for a base pair probability is double.
Definition: structure_file/input.hpp:253
char id_alphabet
The alphabet for an identifier string is char.
Definition: structure_file/input.hpp:244
char comment_alphabet
The alphabet for a comment string is char.
Definition: structure_file/input.hpp:306
double react_type
The type of the reactivity and reactivity error is double.
Definition: structure_file/input.hpp:297
size_t bpp_partner
The type for the partner position of a base pair probability is size_t.
Definition: structure_file/input.hpp:256
The options type defines various option members that influence the behaviour of all or some formats.
Definition: structure_file/input_options.hpp:30
Type that contains multiple types.
Definition: type_list.hpp:29
Provides seqan3::structure_file_input_format.
Provides seqan3::structure_file_input_options.
Provides seqan3::structure_record.
Provides the composite of aminoacid with structure alphabets.
Provides traits for seqan3::type_list.
Adaptations of concepts from the standard library.
T visit(T... args)