c++-gtk-utils
reassembler.h
Go to the documentation of this file.
1 /* Copyright (C) 2005 to 2010 Chris Vine
2 
3 The library comprised in this file or of which this file is part is
4 distributed by Chris Vine under the GNU Lesser General Public
5 License as follows:
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public License
9  as published by the Free Software Foundation; either version 2.1 of
10  the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful, but
13  WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License, version 2.1, for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License, version 2.1, along with this library (see the file LGPL.TXT
19  which came with this source code package in the c++-gtk-utils
20  sub-directory); if not, write to the Free Software Foundation, Inc.,
21  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 
23 */
24 
25 #ifndef CGU_REASSEMBLER_H
26 #define CGU_REASSEMBLER_H
27 
30 
31 namespace Cgu {
32 
33 namespace Utf8 {
34 
35 
36 /**
37  * @class Reassembler reassembler.h c++-gtk-utils/reassembler.h
38  * @brief A class for reassembling UTF-8 strings sent over pipes and
39  * sockets so they form complete valid UTF-8 characters.
40  *
41  * Utf8::Reassembler is a functor class which takes in a partially
42  * formed UTF-8 string and returns a nul-terminated string comprising
43  * such of the input string (after inserting, at the beginning, any
44  * partially formed UTF-8 character which was at the end of the input
45  * string passed in previous calls to the functor) as forms complete
46  * UTF-8 characters (storing any partial character at the end for the
47  * next call to the functor). If the input string contains invalid
48  * UTF-8 after prepending any previously stored partial character (apart
49  * from any partially formed character at the end of the input string) then
50  * operator() will return a null Cgu::SharedHandle<char*> object (that
51  * is, Cgu::SharedHandle<char*>::get() will return 0). Such input
52  * will not be treated as invalid if it consists only of a single
53  * partly formed UTF-8 character which could be valid if further bytes
54  * were received and added to it. In that case the returned
55  * SharedHandle<char*> object will contain an allocated string of zero
56  * length, comprising only a terminating \0 character, rather than a
57  * NULL pointer.
58  *
59  * This enables UTF-8 strings to be sent over pipes, sockets, etc. and
60  * displayed in a GTK+ object at the receiving end.
61  *
62  * Note that for efficiency reasons the memory held in the returned
63  * Cgu::SharedHandle<char*> object may be greater than the length of
64  * the nul-terminated string that is contained in that memory: just
65  * let the Cgu::SharedHandle<char*> object manage the memory, and use
66  * the contents like any other nul-terminated string.
67  *
68  * This class is not needed if std::getline(), with its default '\\n'
69  * delimiter, is used to read UTF-8 characters using, say,
70  * Cgu::fdistream, because a whole '\\n' delimited line of UTF-8
71  * characters will always be complete.
72  *
73  * This is an example of its use, reading from a pipe until it is
74  * closed by the writer and putting the received text in a
75  * GtkTextBuffer object:
76  * @code
77  * using namespace Cgu;
78  *
79  * GtkTextIter end;
80  * GtkTextBuffer* text_buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_view));
81  * gtk_text_buffer_get_end_iter(text_buffer, &end);
82  *
83  * Utf8::Reassembler reassembler;
84  * const int BSIZE = 1024;
85  * char read_buffer[BSIZE];
86  * ssize_t res;
87  * do {
88  * res = ::read(fd, read_buffer, BSIZE);
89  * if (res > 0) {
90  * SharedHandle<char*> utf8(reassembler(read_buffer, res));
91  * if (utf8.get()) {
92  * gtk_text_buffer_insert(text_buffer, &end,
93  * utf8.get(), std::strlen(utf8));
94  * }
95  * else std::cerr << "Invalid utf8 text sent over pipe\n";
96  * }
97  * } while (res && (res != -1 || errno == EINTR));
98  * @endcode
99  */
100 
101 class Reassembler {
102  size_t stored;
103  const static size_t buff_size = 6;
104  char buffer[buff_size];
105  char* join_buffer(const char*, size_t);
106 
107 public:
108 /**
109  * Takes a byte array of wholly or partly formed UTF-8 characters to
110  * be converted (after taking account of previous calls to the method)
111  * to a valid string of wholly formed characters.
112  * @param input The input array.
113  * @param size The number of bytes in the input (not the number of
114  * UTF-8 characters).
115  * @return A Cgu::SharedHandle<char*> object holding a nul-terminated
116  * string comprising such of the input (after inserting, at the
117  * beginning, any partially formed UTF-8 character which was at the
118  * end of the input passed in previous calls to the functor) as forms
119  * complete UTF-8 characters (storing any partial character at the end
120  * for the next call to the functor). If the input is invalid after
121  * such recombination, then a null Cgu::SharedHandle<char*> object is
122  * returned (that is, Cgu::SharedHandle<char*>::get() will return 0).
123  * Such input will not be treated as invalid if it consists only of a
124  * single partly formed UTF-8 character which could be valid if
125  * further bytes were received and added to it. In that case the
126  * returned Cgu::SharedHandle<char*> object will contain an allocated
127  * string of zero length, comprising only a terminating \0 character,
128  * rather than a NULL pointer.
129  * @exception std::bad_alloc The method might throw std::bad_alloc if
130  * memory is exhausted and the system throws in that case. It will
131  * not throw any other exception.
132  */
133  Cgu::SharedHandle<char*> operator()(const char* input, size_t size);
134 
135 /**
136  * Gets the number of bytes of a partially formed UTF-8 character
137  * stored for the next call to operator()(). It will not throw.
138  * @return The number of bytes.
139  */
140  size_t get_stored() const {return stored;}
141 
142 /**
143  * Resets the Reassembler, by discarding any partially formed UTF-8
144  * character from previous calls to operator()(). It will not throw.
145  */
146  void reset() {stored = 0;}
147 
148 /**
149  * The constructor will not throw.
150  */
151  Reassembler(): stored(0), buffer() {}
152 
153 /* Only has effect if --with-glib-memory-slices-compat or
154  * --with-glib-memory-slices-no-compat option picked */
156 };
157 
158 } // namespace Utf8
159 
160 } // namespace Cgu
161 
162 #endif
Cgu
Definition: application.h:45
Cgu::Utf8::Reassembler
A class for reassembling UTF-8 strings sent over pipes and sockets so they form complete valid UTF-8 ...
Definition: reassembler.h:101
Cgu::SharedHandle
This is a generic class for managing the lifetime of objects allocated on freestore.
Definition: shared_handle.h:410
Cgu::Utf8::Reassembler::get_stored
size_t get_stored() const
Definition: reassembler.h:140
CGU_GLIB_MEMORY_SLICES_FUNCS
#define CGU_GLIB_MEMORY_SLICES_FUNCS
Definition: cgu_config.h:84
shared_handle.h
Cgu::Utf8::Reassembler::reset
void reset()
Definition: reassembler.h:146
Cgu::Utf8::Reassembler::Reassembler
Reassembler()
Definition: reassembler.h:151
cgu_config.h
Cgu::Utf8::Reassembler::operator()
Cgu::SharedHandle< char * > operator()(const char *input, size_t size)