c++-gtk-utils
convert.h
Go to the documentation of this file.
1 /* Copyright (C) 2005 to 2014 Chris Vine
2 
3 The library comprised in this file or of which this file is part is
4 distributed by Chris Vine under the GNU Lesser General Public
5 License as follows:
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public License
9  as published by the Free Software Foundation; either version 2.1 of
10  the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful, but
13  WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License, version 2.1, for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License, version 2.1, along with this library (see the file LGPL.TXT
19  which came with this source code package in the src/utils sub-directory);
20  if not, write to the Free Software Foundation, Inc.,
21  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 
23 However, it is not intended that the object code of a program whose
24 source code instantiates a template from this file or uses macros or
25 inline functions (of any length) should by reason only of that
26 instantiation or use be subject to the restrictions of use in the GNU
27 Lesser General Public License. With that in mind, the words "and
28 macros, inline functions and instantiations of templates (of any
29 length)" shall be treated as substituted for the words "and small
30 macros and small inline functions (ten lines or less in length)" in
31 the fourth paragraph of section 5 of that licence. This does not
32 affect any other reason why object code may be subject to the
33 restrictions in that licence (nor for the avoidance of doubt does it
34 affect the application of section 2 of that licence to modifications
35 of the source code in this file).
36 
37 */
38 
39 #ifndef CGU_CONVERT_H
40 #define CGU_CONVERT_H
41 
42 #include <string>
43 #include <iterator>
44 #include <exception>
45 
46 #include <glib.h>
47 
50 
51 namespace Cgu {
52 
53 /**
54  * @file convert.h
55  * @brief This file contains functions for converting between
56  * character sets.
57  *
58  * \#include <c++-gtk-utils/convert.h>
59  *
60  * This file contains functions for converting between character sets.
61  * If you want these functions to work, you will generally have needed
62  * to have set the locale in the relevant program with either
63  * <em>std::locale::global(std::locale(""))</em> (from the C++
64  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C
65  * standard library).
66  */
67 
68 /**
69  * @namespace Cgu::Utf8
70  * @brief This namespace contains utilities relevant to the use of
71  * UTF-8 in programs.
72  *
73  * \#include <c++-gtk-utils/convert.h> (for conversion and validation
74  * functions)
75  *
76  * \#include <c++-gtk-utils/reassembler.h> (for Reassembler class)
77  * @sa convert.h reassembler.h
78  *
79  * This namespace contains utilities relevant to the use of UTF-8 in
80  * programs. If you want these functions to work, you will generally
81  * have needed to have set the locale in the relevant program with
82  * either <em>std::locale::global(std::locale(""))</em> (from the C++
83  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C standard
84  * library).
85  */
86 
87 namespace Utf8 {
88 
89 class ConversionError: public std::exception {
90  GcharSharedHandle message;
91 public:
92  virtual const char* what() const throw() {return (const char*)message.get();}
93  ConversionError(const char* msg):
94  message(g_strdup_printf("Utf8::ConversionError: %s", msg)) {}
95  ConversionError(GError* error):
96  message(g_strdup_printf("Utf8::ConversionError: %s", error->message)) {}
97  ~ConversionError() throw() {}
98 };
99 
100 /**
101  * Converts text from UTF-8 to the system's Unicode wide character
102  * representation, which will be UTF-32/UCS-4 for systems with a wide
103  * character size of 4 (almost all unix-like systems), and UTF-16 for
104  * systems with a wide character size of 2.
105  * @param input Text in valid UTF-8 format.
106  * @return The input text converted to UTF-32 or UTF-16.
107  * @exception Cgu::Utf8::ConversionError This exception will be thrown
108  * if conversion fails because the input string is not in valid UTF-8
109  * format or the system does not support wide character Unicode
110  * strings.
111  * @exception std::bad_alloc This function might throw std::bad_alloc
112  * if memory is exhausted and the system throws in that case.
113  */
114 std::wstring uniwide_from_utf8(const std::string& input);
115 
116 /**
117  * Converts text from the system's Unicode wide character
118  * representation, which will be UTF-32/UCS-4 for systems with a wide
119  * character size of 4 (almost all unix-like systems) and UTF-16 for
120  * systems with a wide character size of 2, to narrow character UTF-8
121  * format.
122  * @param input Text in valid UTF-32 or UTF-16 format.
123  * @return The input text converted to UTF-8.
124  * @exception Cgu::Utf8::ConversionError This exception will be thrown
125  * if conversion fails because the input string is not in valid
126  * UTF-32/UCS-4 or UTF-16 format or the system does not support wide
127  * character Unicode strings.
128  * @exception std::bad_alloc This function might throw std::bad_alloc
129  * if memory is exhausted and the system throws in that case.
130  */
131 std::string uniwide_to_utf8(const std::wstring& input);
132 
133 /**
134  * Converts text from UTF-8 to UTF-32/UCS-4.
135  * @param input Text in valid UTF-8 format.
136  * @return The input text converted to UTF-32.
137  * @exception Cgu::Utf8::ConversionError This exception will be thrown
138  * if conversion fails because the input string is not in valid UTF-8
139  * format or the system does not support wide character Unicode
140  * strings.
141  * @exception std::bad_alloc This function might throw std::bad_alloc
142  * if memory is exhausted and the system throws in that case.
143  */
144 std::u32string utf32_from_utf8(const std::string& input);
145 
146 /**
147  * Converts text from UTF-32/UCS-4 to narrow character UTF-8 format.
148  * @param input Text in valid UTF-32 format.
149  * @return The input text converted to UTF-8.
150  * @exception Cgu::Utf8::ConversionError This exception will be thrown
151  * if conversion fails because the input string is not in valid
152  * UTF-32/UCS-4 format or the system does not support wide character
153  * Unicode strings.
154  * @exception std::bad_alloc This function might throw std::bad_alloc
155  * if memory is exhausted and the system throws in that case.
156  */
157 std::string utf32_to_utf8(const std::u32string& input);
158 
159 /**
160  * Converts text from UTF-8 to UTF-16.
161  * @param input Text in valid UTF-8 format.
162  * @return The input text converted to UTF-16.
163  * @exception Cgu::Utf8::ConversionError This exception will be thrown
164  * if conversion fails because the input string is not in valid UTF-8
165  * format or the system does not support wide character Unicode
166  * strings.
167  * @exception std::bad_alloc This function might throw std::bad_alloc
168  * if memory is exhausted and the system throws in that case.
169  */
170 std::u16string utf16_from_utf8(const std::string& input);
171 
172 /**
173  * Converts text from UTF-16 to narrow character UTF-8 format.
174  * @param input Text in valid UTF-16 format.
175  * @return The input text converted to UTF-8.
176  * @exception Cgu::Utf8::ConversionError This exception will be thrown
177  * if conversion fails because the input string is not in valid UTF-16
178  * format or the system does not support wide character Unicode
179  * strings.
180  * @exception std::bad_alloc This function might throw std::bad_alloc
181  * if memory is exhausted and the system throws in that case.
182  */
183 std::string utf16_to_utf8(const std::u16string& input);
184 
185 /**
186  * Converts text from UTF-8 to the system's wide character locale
187  * representation. For this function to work correctly, the system's
188  * installed iconv() must support conversion to a generic wchar_t
189  * target, but in POSIX whether it does so is implementation defined
190  * (GNU's C library implementation does). For most unix-like systems
191  * the wide character representation will be Unicode (UCS-4/UTF-32 or
192  * UTF-16), and where that is the case use the uniwide_from_utf8()
193  * function instead, which will not rely on the generic target being
194  * available.
195  * @param input Text in valid UTF-8 format.
196  * @return The input text converted to the system's wide character
197  * locale representation.
198  * @exception Cgu::Utf8::ConversionError This exception will be thrown
199  * if conversion fails because the input string is not in valid UTF-8
200  * format, or cannot be converted to the system's wide character
201  * locale representation (eg because the input characters cannot be
202  * represented by that encoding, or the system's installed iconv()
203  * function does not support conversion to a generic wchar_t target).
204  * @exception std::bad_alloc This function might throw std::bad_alloc
205  * if memory is exhausted and the system throws in that case.
206  */
207 
208 std::wstring wide_from_utf8(const std::string& input);
209 
210 /**
211  * Converts text from the system's wide character locale
212  * representation to UTF-8. For this function to work correctly, the
213  * system's installed iconv() must support conversion from a generic
214  * wchar_t target, but in POSIX whether it does so is implementation
215  * defined (GNU's C library implementation does). For most unix-like
216  * systems the wide character representation will be Unicode
217  * (UCS-4/UTF-32 or UTF-16), and where that is the case use the
218  * uniwide_to_utf8() function instead, which will not rely on the
219  * generic target being available.
220  * @param input Text in a valid wide character locale format.
221  * @return The input text converted to UTF-8.
222  * @exception Cgu::Utf8::ConversionError This exception will be thrown
223  * if conversion fails because the input string is not in a valid wide
224  * character locale format, or cannot be converted to UTF-8 (eg
225  * because the system's installed iconv() function does not support
226  * conversion from a generic wchar_t target).
227  * @exception std::bad_alloc This function might throw std::bad_alloc
228  * if memory is exhausted and the system throws in that case.
229  */
230 std::string wide_to_utf8(const std::wstring& input);
231 
232 /**
233  * Converts text from UTF-8 to the system's filename encoding.
234  * @param input Text in valid UTF-8 format.
235  * @return The input text converted to filename encoding.
236  * @exception Cgu::Utf8::ConversionError This exception will be thrown
237  * if conversion fails because the input string is not in valid UTF-8
238  * format, or cannot be converted to filename encoding (eg because the
239  * input characters cannot be represented by that encoding).
240  * @exception std::bad_alloc This function might throw std::bad_alloc
241  * if memory is exhausted and the system throws in that case.
242  * @note glib takes the system's filename encoding from the
243  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
244  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
245  * set, it will be assumed that the filename encoding is the same as
246  * the locale encoding. If G_FILENAME_ENCODING is set, then
247  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
248  * the value held by G_FILENAME_ENCODING.
249  */
250 std::string filename_from_utf8(const std::string& input);
251 
252 /**
253  * Converts text from the system's filename encoding to UTF-8.
254  * @param input Text in valid filename encoding.
255  * @return The input text converted to UTF-8.
256  * @exception Cgu::Utf8::ConversionError This exception will be thrown
257  * if conversion fails because the input string is not in valid
258  * filename encoding.
259  * @exception std::bad_alloc This function might throw std::bad_alloc
260  * if memory is exhausted and the system throws in that case.
261  * @note glib takes the system's filename encoding from the
262  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
263  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
264  * set, it will be assumed that the filename encoding is the same as
265  * the locale encoding. If G_FILENAME_ENCODING is set, then
266  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
267  * the value held by G_FILENAME_ENCODING.
268  */
269 std::string filename_to_utf8(const std::string& input);
270 
271 /**
272  * Converts text from UTF-8 to the system's locale encoding.
273  * @param input Text in valid UTF-8 format.
274  * @return The input text converted to locale encoding.
275  * @exception Cgu::Utf8::ConversionError This exception will be thrown
276  * if conversion fails because the input string is not in valid UTF-8
277  * format, or cannot be converted to locale encoding (eg because the
278  * input characters cannot be represented by that encoding).
279  * @exception std::bad_alloc This function might throw std::bad_alloc
280  * if memory is exhausted and the system throws in that case.
281  */
282 std::string locale_from_utf8(const std::string& input);
283 
284 /**
285  * Converts text from the system's locale encoding to UTF-8.
286  * @param input Text in valid locale encoding.
287  * @return The input text converted to UTF-8.
288  * @exception Cgu::Utf8::ConversionError This exception will be thrown
289  * if conversion fails because the input string is not in valid locale
290  * encoding.
291  * @exception std::bad_alloc This function might throw std::bad_alloc
292  * if memory is exhausted and the system throws in that case.
293  */
294 std::string locale_to_utf8(const std::string& input);
295 
296 /**
297  * Indicates whether the input text comprises valid UTF-8.
298  * @param text The text to be tested.
299  * @return true if the input text is in valid UTF-8 format, otherwise
300  * false.
301  * @exception std::bad_alloc This function might throw std::bad_alloc
302  * if std::string::data() might throw when memory is exhausted.
303  * @note \#include <c++-gtk-utils/convert.h> for this function.
304  */
305 inline bool validate(const std::string& text) {
306  return g_utf8_validate(text.data(), text.size(), 0);
307 }
308 
309 /************** Iterator class **************/
310 
311 /**
312  * @class Iterator convert.h c++-gtk-utils/convert.h
313  * @brief A class which will iterate through a std::string object by
314  * reference to unicode characters rather than by bytes.
315  * @sa Cgu::Utf8::ReverseIterator
316  *
317  * The Cgu::Utf8::Iterator class does the same as
318  * std::string::const_iterator, except that when iterating through a
319  * std::string object using the ++ and - - postfix and prefix
320  * operators, it iterates by increments of whole unicode code points
321  * rather than by reference to bytes. In addition, the dereferencing
322  * operator returns the whole unicode code point (a UCS-4 gunichar
323  * type) rather than a char type.
324  *
325  * Where, as in practically all unix-like systems, sizeof(wchar_t) ==
326  * 4, then the gunichar return value of the dereferencing operator can
327  * be converted by a simple static_cast to the wchar_t type. So far
328  * as displaying individual code points is concerned however, it
329  * should be noted that because unicode allows combining characters, a
330  * unicode code point may not contain the whole representation of a
331  * character as displayed. This effect can be dealt with for all
332  * characters capable of representation by Level 1 unicode (ie by
333  * precomposed characters) using g_utf8_normalize() before iterating.
334  * There will still however be some non-European scripts, in
335  * particular some Chinese/Japanese/Korean ideograms, where
336  * description of the ideogram requires more than one code point to be
337  * finally resolved. For these, printing individual code points
338  * sequentially one by one directly to a display (say with std::wcout)
339  * may or may not have the desired result, depending on how the
340  * display device (eg console) deals with that case.
341  *
342  * A Cgu::Utf8::Iterator only allows reading from and not writing to
343  * the std::string object being iterated through. This is because in
344  * UTF-8 the representation of any one unicode code point will require
345  * between 1 and 6 bytes: accordingly modifying a UTF-8 string may
346  * change its length (in bytes) even though the number of unicode
347  * characters stays the same. For the same reason, this iterator is a
348  * bidirectional iterator but not a random access iterator.
349  *
350  * The std::string object concerned should contain valid UTF-8 text.
351  * If necessary, this should be checked with Cgu::Utf8::validate()
352  * first. In addition, before use, the Cgu::Utf8::Iterator object
353  * must be initialized by a std::string::const_iterator or
354  * std::string::iterator object pointing to the first byte of a valid
355  * UTF-8 character in the string (or by another Cgu::Utf8::Iterator
356  * object or by a Cgu::Utf8::ReverseIterator object), and iteration
357  * will begin at the point of initialization: therefore, assuming the
358  * string contains valid UTF-8 text, passing std::string::begin() to a
359  * Cgu::Utf8::Iterator object will always be safe. Initialization by
360  * std::string::end() is also valid if the first iteration is
361  * backwards with the \-- operator. This initialization can be done
362  * either in the constructor or by assignment. Comparison operators
363  * ==, !=, <, <=, > and >= are provided enabling the position of
364  * Cgu::Utf8::Iterator objects to be compared with each other or with
365  * std::string::const_iterator and std::string::iterator objects.
366  *
367  * This is an example:
368  * @code
369  * using namespace Cgu;
370  *
371  * std::wstring wide_str(L"ßøǿón");
372  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
373  *
374  * Utf8::Iterator iter;
375  * for (iter = narrow_str.begin();
376  * iter != narrow_str.end();
377  * ++iter)
378  * std::wcout << static_cast<wchar_t>(*iter) << std::endl;
379  * @endcode
380  *
381  * This class assumes in using g_utf8_next_char(), g_utf8_prev_char()
382  * and g_utf8_get_char() that the std::string object keeps its
383  * internal string in contiguous storage. This is required by the
384  * C++11/14 standard, but not formally by C++98/C++03. However, known
385  * implementations of std::string in fact store the string
386  * contiguously.
387  */
388 
389 class ReverseIterator;
390 
// Read-only bidirectional iterator which steps through a std::string
// one UTF-8 encoded unicode code point at a time.
391 class Iterator {
392 public:
393  typedef gunichar value_type;
394  typedef gunichar reference; // read only: operator*() returns the code point by value
395  typedef void pointer; // read only: no pointer-to-element type is offered
396  typedef std::string::difference_type difference_type;
397  typedef std::bidirectional_iterator_tag iterator_category;
398 
399 private:
400  std::string::const_iterator pos; // current position in the string, exposed read-only by base()
401 public:
402 
403 /**
404  * Increments the iterator so that it moves from the beginning of the
405  * current UTF-8 character to the beginning of the next UTF-8
406  * character. It is a prefix operator. It will not throw.
407  * @return A reference to the iterator in its new position.
408  */
409  Iterator& operator++();
410 
411 /**
412  * Increments the iterator so that it moves from the beginning of the
413  * current UTF-8 character to the beginning of the next UTF-8
414  * character. It is a postfix operator. It will not throw provided
415  * that copy constructing and assigning a std::string::const_iterator
416  * object does not throw, as it will not in any sane implementation.
417  * @return A copy of the iterator in its former position.
418  */
419  Iterator operator++(int);
420 
421 /**
422  * Decrements the iterator so that it moves from the beginning of the
423  * current UTF-8 character to the beginning of the previous UTF-8
424  * character. It is a prefix operator. It will not throw.
425  * @return A reference to the iterator in its new position.
426  */
427  Iterator& operator--();
428 
429 /**
430  * Decrements the iterator so that it moves from the beginning of the
431  * current UTF-8 character to the beginning of the previous UTF-8
432  * character. It is a postfix operator. It will not throw provided
433  * that copy constructing and assigning a std::string::const_iterator
434  * object does not throw, as it will not in any sane implementation.
435  * @return A copy of the iterator in its former position.
436  */
437  Iterator operator--(int);
438 
439 /**
440  * Assigns a std::string::const_iterator object to this object. It
441  * should point to the beginning of a UTF-8 character (eg
442  * std::string::begin()) or to std::string::end(). It will not throw
443  * provided assigning a std::string::const_iterator object does not
444  * throw, as it will not in any sane implementation.
445  * @param iter The std::string::const_iterator.
446  * @return A reference to this Cgu::Utf8::Iterator object after
447  * assignment.
448  */
449  Iterator& operator=(const std::string::const_iterator& iter) {pos = iter; return *this;}
450 
451 /**
452  * Assigns a std::string::iterator object to this object. It should
453  * point to the beginning of a UTF-8 character (eg
454  * std::string::begin()) or to std::string::end(). It will not throw
455  * provided assigning a std::string::const_iterator object does not
456  * throw, as it will not in any sane implementation.
457  * @param iter The std::string::iterator.
458  * @return A reference to this Cgu::Utf8::Iterator object after
459  * assignment.
460  */
461  Iterator& operator=(const std::string::iterator& iter) {pos = iter; return *this;}
462 
463 /**
464  * Assigns a Cgu::Utf8::Iterator object to this object. It will not
465  * throw provided assigning a std::string::const_iterator object does
466  * not throw, as it will not in any sane implementation.
467  * @param iter The iterator.
468  * @return A reference to this Cgu::Utf8::Iterator object after
469  * assignment.
470  */
471  Iterator& operator=(const Iterator& iter) {pos = iter.pos; return *this;}
472 
473 /**
474  * Assigns a Cgu::Utf8::ReverseIterator object to this object, so that
475  * this iterator adopts the same physical position (but the logical
476  * position will be offset to the following UTF-8 character). It will
477  * not throw provided assigning a std::string::const_iterator object
478  * does not throw, as it will not in any sane implementation.
479  * @param iter The iterator.
480  * @return A reference to this Cgu::Utf8::Iterator object after
481  * assignment.
482  */
483  Iterator& operator=(const ReverseIterator& iter);
484 
485 /**
486  * The dereference operator.
487  * @return A 32-bit gunichar object containing the whole unicode code
488  * point which is currently represented by this iterator. It will not
489  * throw.
 * @note The underlying std::string::const_iterator is dereferenced in
 * order to obtain the address passed to g_utf8_get_char(), so this
 * iterator must refer to a character and not to std::string::end()
 * when this operator is applied.
490  */
491  Iterator::value_type operator*() const {return g_utf8_get_char(&(*pos));}
492 
493 /**
494  * @return The current underlying std::string::const_iterator kept by
495  * this iterator. Once this iterator has been correctly initialized,
496  * that will point to the beginning of the UTF-8 character currently
497  * represented by this iterator or to std::string::end(). It will not
498  * throw provided assigning a std::string::const_iterator object does
499  * not throw, as it will not in any sane implementation.
500  */
501  std::string::const_iterator base() const {return pos;}
502 
503 /**
504  * Constructs this iterator and initialises it with a
505  * std::string::const_iterator object. It should point to the
506  * beginning of a UTF-8 character (eg std::string::begin()) or to
507  * std::string::end(). It will not throw provided that copy
508  * constructing a std::string::const_iterator object does not throw,
509  * as it will not in any sane implementation. This is a type
510  * conversion constructor (it is not marked explicit) so that it can
511  * be used with Cgu::Utf8::Iterator comparison operators to compare
512  * the position of Cgu::Utf8::Iterator with
513  * std::string::const_iterator objects.
514  * @param iter The std::string::const_iterator.
515  */
516  Iterator(const std::string::const_iterator& iter): pos(iter) {}
517 
518 /**
519  * Constructs this iterator and initialises it with a
520  * std::string::iterator object. It should point to the beginning of
521  * a UTF-8 character (eg std::string::begin()) or to
522  * std::string::end(). It will not throw provided that copy
523  * constructing a std::string::const_iterator object does not throw,
524  * as it will not in any sane implementation. This is a type
525  * conversion constructor (it is not marked explicit) so that it can
526  * be used with Cgu::Utf8::Iterator comparison operators to compare
527  * the position of Cgu::Utf8::Iterator with std::string::iterator
528  * objects.
529  * @param iter The std::string::iterator.
530  */
531  Iterator(const std::string::iterator& iter): pos(iter) {}
532 
533 /**
534  * Constructs this iterator and initialises it with another
535  * Cgu::Utf8::Iterator object. It will not throw provided that copy
536  * constructing a std::string::const_iterator object does not throw,
537  * as it will not in any sane implementation.
538  * @param iter The iterator.
539  */
540  Iterator(const Iterator& iter): pos(iter.pos) {}
541 
542 /**
543  * Constructs this iterator and initialises it with a
544  * Cgu::Utf8::ReverseIterator object, so that this iterator adopts the
545  * same physical position (but the logical position will be offset to
546  * the following UTF-8 character). It will not throw provided that
547  * copy constructing a std::string::const_iterator object does not
548  * throw, as it will not in any sane implementation.
549  * @param iter The iterator.
550  */
551  explicit Iterator(const ReverseIterator& iter);
552 
553 /**
554  * The default constructor will not throw.
 * @note No position is supplied to the underlying
 * std::string::const_iterator, so a position must be assigned to
 * this iterator before it is incremented, decremented or
 * dereferenced.
555  */
556  Iterator() {}
557 
558 /* Only has effect if --with-glib-memory-slices-compat or
559  * --with-glib-memory-slices-no-compat option picked
 * NOTE(review): the declaration this comment describes is not visible
 * in this listing (its numbering skips from 559 to 561) - confirm
 * against the original header. */
561 };
562 
564  const std::string::value_type* tmp = &(*pos);
565  // using g_utf8_next_char is safe even when pos points to the last character -
566  // that macro calls up the g_utf8_skip look-up table rather than attempting to
567  // read the following character, so we can safely iterate to std::string::end()
568  pos += g_utf8_next_char(tmp) - tmp;
569  return *this;
570 }
571 
573  Iterator tmp{*this};
574  ++(*this);
575  return tmp;
576 }
577 
579  // we might be iterating from std::string::end() so we need
580  // to decrement before dereferencing and then increment again
581  const std::string::value_type* tmp = &(*(pos-1));
582  ++tmp;
583  pos -= tmp - g_utf8_prev_char(tmp);
584  return *this;
585 }
586 
588  Iterator tmp{*this};
589  --(*this);
590  return tmp;
591 }
592 
593 /**
594  * The comparison operators will not throw provided assigning a
595  * std::string::const_iterator object does not throw, as it will not
596  * in any sane implementation.
597  */
598 inline bool operator==(const Iterator& iter1, const Iterator& iter2) {
599  return (iter1.base() == iter2.base());
600 }
601 
602 /**
603  * The comparison operators will not throw provided assigning a
604  * std::string::const_iterator object does not throw, as it will not
605  * in any sane implementation.
606  */
607 inline bool operator!=(const Iterator& iter1, const Iterator& iter2) {
608  return (iter1.base() != iter2.base());
609 }
610 
611 /**
612  * The comparison operators will not throw provided assigning a
613  * std::string::const_iterator object does not throw, as it will not
614  * in any sane implementation.
615  */
616 inline bool operator<(const Iterator& iter1, const Iterator& iter2) {
617  return (iter1.base() < iter2.base());
618 }
619 
620 /**
621  * The comparison operators will not throw provided assigning a
622  * std::string::const_iterator object does not throw, as it will not
623  * in any sane implementation.
624  */
625 inline bool operator<=(const Iterator& iter1, const Iterator& iter2) {
626  return (iter1.base() <= iter2.base());
627 }
628 
629 /**
630  * The comparison operators will not throw provided assigning a
631  * std::string::const_iterator object does not throw, as it will not
632  * in any sane implementation.
633  */
634 inline bool operator>(const Iterator& iter1, const Iterator& iter2) {
635  return (iter1.base() > iter2.base());
636 }
637 
638 /**
639  * The comparison operators will not throw provided assigning a
640  * std::string::const_iterator object does not throw, as it will not
641  * in any sane implementation.
642  */
643 inline bool operator>=(const Iterator& iter1, const Iterator& iter2) {
644  return (iter1.base() >= iter2.base());
645 }
646 
647 /************** ReverseIterator class **************/
648 
649 /**
650  * @class ReverseIterator convert.h c++-gtk-utils/convert.h
651  * @brief A class which will iterate in reverse through a std::string
652  * object by reference to unicode characters rather than by bytes.
653  * @sa Cgu::Utf8::Iterator
654  *
655  * The Cgu::Utf8::ReverseIterator class does the same as
656  * std::string::const_reverse_iterator, except that when iterating
657  * through a std::string object using the ++ and - - postfix and
658  * prefix operators, it iterates by increments of whole unicode code
659  * points rather than by reference to bytes. In addition, the
660  * dereferencing operator returns the whole unicode code point (a
661  * UCS-4 gunichar type) rather than a char type.
662  *
663  * Before use, the Cgu::Utf8::ReverseIterator object must be
664  * initialized by a std::string::const_reverse_iterator or
665  * std::string::reverse_iterator object representing the first byte of
666  * a valid UTF-8 character in the string (or by another
667  * Cgu::Utf8::ReverseIterator object or by a Cgu::Utf8::Iterator
668  * object): so assuming the string contains valid UTF-8 text, it is
669  * always valid to initialise a Cgu::Utf8::ReverseIterator with
670  * std::string::rbegin(). Initialization by std::string::rend() is
670  * also valid if the first iteration is backwards with the \--
672  * operator. This initialization can be done either in the
673  * constructor or by assignment. Comparison operators ==, !=, <, <=,
674  * > and >= are provided enabling the position of
675  * Cgu::Utf8::ReverseIterator objects to be compared with each other
676  * or with std::string::const_reverse_iterator and
677  * std::string::reverse_iterator objects.
678  *
679  * This is an example:
680  * @code
681  * using namespace Cgu;
682  *
683  * std::wstring wide_str(L"ßøǿón");
684  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
685  *
686  * Utf8::ReverseIterator iter;
687  * for (iter = narrow_str.rbegin();
688  * iter != narrow_str.rend();
689  * ++iter)
690  * std::wcout << static_cast<wchar_t>(*iter) << std::endl;
691  * @endcode
692  *
693  * For further information on its use, see the Utf8::Iterator
694  * documentation.
695  */
696 
698 public:
699  typedef gunichar value_type;
700  typedef gunichar reference; // read only
701  typedef void pointer; // read only
702  typedef std::string::difference_type difference_type;
703  typedef std::bidirectional_iterator_tag iterator_category;
704 
705 private:
706  std::string::const_iterator pos;
707  // we use cache to make iterating and then dereferencing more efficient
708  mutable std::string::const_iterator cache;
709 public:
710 
711 /**
712  * Increments the iterator in the reverse direction so that it moves
713  * from the beginning of the current UTF-8 character to the beginning
714  * of the previous UTF-8 character in the std::string object
715  * concerned. It is a prefix operator. It will not throw provided
716  * assigning a std::string::const_iterator object does not throw, as
717  * it will not in any sane implementation.
718  * @return A reference to the iterator in its new position
719  */
721 
722 /**
723  * Increments the iterator in the reverse direction so that it moves
724  * from the beginning of the current UTF-8 character to the beginning
725  * of the previous UTF-8 character in the std::string object
726  * concerned. It is a postfix operator. It will not throw provided
727  * that copy constructing and assigning a std::string::const_iterator
728  * object does not throw, as it will not in any sane implementation.
729  * @return A copy of the iterator in its former position
730  */
732 
733 /**
734  * Decrements the iterator in the reverse direction so that it moves
735  * from the beginning of the current UTF-8 character to the beginning
736  * of the following UTF-8 character in the std::string object
737  * concerned. It is a prefix operator. It will not throw provided
738  * assigning a std::string::const_iterator object does not throw, as
739  * it will not in any sane implementation.
740  * @return A reference to the iterator in its new position
741  */
743 
744 /**
745  * Decrements the iterator in the reverse direction so that it moves
746  * from the beginning of the current UTF-8 character to the beginning
747  * of the following UTF-8 character in the std::string object
748  * concerned. It is a postfix operator. It will not throw provided
749  * that copy constructing and assigning a std::string::const_iterator
750  * object does not throw, as it will not in any sane implementation.
751  * @return A copy of the iterator in its former position
752  */
754 
755 /**
756  * Assigns a std::string::const_reverse_iterator object to this
757  * object. It should represent the beginning of a UTF-8 character (eg
758  * std::string::rbegin()) or comprise std::string::rend(). It will
759  * not throw provided assigning a std::string::const_iterator object
760  * does not throw, as it will not in any sane implementation.
761  * @param iter The const_reverse_iterator.
762  * @return A reference to this Cgu::Utf8::ReverseIterator object after
763  * assignment.
764  */
765  ReverseIterator& operator=(const std::string::const_reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;}
766 
767 /**
768  * Assigns a std::string::reverse_iterator object to this object. It
769  * should represent the beginning of a UTF-8 character (eg
770  * std::string::rbegin()) or comprise std::string::rend(). It will
771  * not throw provided assigning a std::string::const_iterator object
772  * does not throw, as it will not in any sane implementation.
773  * @param iter The reverse_iterator.
774  * @return A reference to this Cgu::Utf8::ReverseIterator object after
775  * assignment.
776  */
777  ReverseIterator& operator=(const std::string::reverse_iterator& iter) {pos = iter.base(); cache = pos; return *this;}
778 
779 /**
780  * Assigns a Cgu::Utf8::ReverseIterator object to this object. It
781  * will not throw provided assigning a std::string::const_iterator
782  * object does not throw, as it will not in any sane implementation.
783  * @param iter The iterator.
784  * @return A reference to this Cgu::Utf8::ReverseIterator object after
785  * assignment.
786  */
787  ReverseIterator& operator=(const ReverseIterator& iter) {pos = iter.pos; cache = iter.cache; return *this;}
788 
789 /**
790  * Assigns a Cgu::Utf8::Iterator object to this object, so that this
791  * iterator adopts the same physical position (but the logical
792  * position will be offset to the previous UTF-8 character in the
793  * std::string object concerned). It will not throw provided
794  * assigning a std::string::const_iterator object does not throw, as
795  * it will not in any sane implementation.
796  * @param iter The iterator.
797  * @return A reference to this Cgu::Utf8::ReverseIterator object after
798  * assignment.
799  */
800  ReverseIterator& operator=(const Iterator& iter) {pos = iter.base(); cache = pos; return *this;}
801 
802 /**
803  * The dereference operator. Note that although this method is const,
804  * it is not thread safe for concurrent reads without external
805  * synchronization because it writes to an internal cache.
806  * @return A 32-bit gunichar object containing the whole unicode code
807  * point which is currently represented by this iterator. It will not
808  * throw.
809  */
811 
812 /**
813  * @return The current underlying std::string::const_iterator kept by
814  * this iterator. Once this iterator has been correctly initialized,
815  * that will point to the beginning of the UTF-8 character after the
816  * one currently represented by this iterator or to
817  * std::string::end(). It will not throw provided assigning a
818  * std::string::const_iterator object does not throw, as it will not
819  * in any sane implementation.
820  */
821  std::string::const_iterator base() const {return pos;}
822 
823 /**
824  * Constructs this iterator and initialises it with a
825  * std::string::const_reverse_iterator object. It should represent
826  * the beginning of a UTF-8 character (eg std::string::rbegin()) or
827  * comprise std::string::rend(). It will not throw provided that copy
828  * constructing a std::string::const_iterator object does not throw,
829  * as it will not in any sane implementation. This is a type
830  * conversion constructor (it is not marked explicit) so that it can
831  * be used with Cgu::Utf8::ReverseIterator comparison operators to
832  * compare the position of Cgu::Utf8::ReverseIterator with
833  * std::string::const_reverse_iterator objects.
834  * @param iter The const_reverse_iterator.
835  */
836  ReverseIterator(const std::string::const_reverse_iterator& iter): pos(iter.base()), cache(pos) {}
837 
838 /**
839  * Constructs this iterator and initialises it with a
840  * std::string::reverse_iterator object. It should represent the
841  * beginning of a UTF-8 character (eg std::string::rbegin()) or
842  * comprise std::string::rend(). It will not throw provided that copy
843  * constructing a std::string::const_iterator object does not throw,
844  * as it will not in any sane implementation. This is a type
845  * conversion constructor (it is not marked explicit) so that it can
846  * be used with Cgu::Utf8::ReverseIterator comparison operators to
847  * compare the position of Cgu::Utf8::ReverseIterator with
848  * std::string::reverse_iterator objects.
849  * @param iter The reverse_iterator.
850  */
851  ReverseIterator(const std::string::reverse_iterator& iter): pos(iter.base()), cache(pos) {}
852 
853 /**
854  * Constructs this iterator and initialises it with another
855  * Cgu::Utf8::ReverseIterator object. It will not throw provided that
856  * copy constructing a std::string::const_iterator object does not
857  * throw, as it will not in any sane implementation.
858  * @param iter The iterator.
859  */
860  ReverseIterator(const ReverseIterator& iter): pos(iter.pos), cache(iter.cache) {}
861 
862 /**
863  * Constructs this iterator and initialises it with a
864  * Cgu::Utf8::Iterator object, so that this iterator adopts the same
865  * physical position (but the logical position will be offset to the
866  * previous UTF-8 character in the std::string object concerned). It
867  * will not throw provided that copy constructing a
868  * std::string::const_iterator object does not throw, as it will not
869  * in any sane implementation.
870  * @param iter The iterator.
871  */
872  explicit ReverseIterator(const Iterator& iter): pos(iter.base()), cache(pos) {}
873 
874 /**
875  * The default constructor will not throw.
876  */
878 
879 /* Only has effect if --with-glib-memory-slices-compat or
880  * --with-glib-memory-slices-no-compat option picked */
882 };
883 
885 
886  if (pos > cache) pos = cache;
887 
888  else {
889  // we might be iterating from std::string::end()/std::string::rbegin() so
890  // we need to decrement before dereferencing and then increment again
891  const std::string::value_type* tmp = &(*(pos-1));
892  ++tmp;
893  pos -= tmp - g_utf8_prev_char(tmp);
894  }
895  return *this;
896 }
897 
899  ReverseIterator tmp{*this};
900  ++(*this);
901  return tmp;
902 }
903 
905  cache = pos;
906  const std::string::value_type* tmp = &(*pos);
907  // using g_utf8_next_char is safe even when pos points to the first character -
908  // that macro calls up the g_utf8_skip look-up table rather than attempting to
909  // read the following character, so we can safely iterate to std::string::rbegin()
910  pos += g_utf8_next_char(tmp) - tmp;
911  return *this;
912 }
913 
915  ReverseIterator tmp{*this};
916  --(*this);
917  return tmp;
918 }
919 
921  Iterator tmp{*this};
922  --tmp;
923  cache = tmp.base();
924  return g_utf8_get_char(&(*(tmp.base())));
925 }
926 
927 /**
928  * The comparison operators will not throw provided assigning a
929  * std::string::const_iterator object does not throw, as it will not
930  * in any sane implementation.
931  */
932 inline bool operator==(const ReverseIterator& iter1, const ReverseIterator& iter2) {
933  return (iter1.base() == iter2.base());
934 }
935 
936 /**
937  * The comparison operators will not throw provided assigning a
938  * std::string::const_iterator object does not throw, as it will not
939  * in any sane implementation.
940  */
941 inline bool operator!=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
942  return (iter1.base() != iter2.base());
943 }
944 
945 /**
946  * The comparison operators will not throw provided assigning a
947  * std::string::const_iterator object does not throw, as it will not
948  * in any sane implementation. Ordering is viewed from the
949  * perspective of the logical operation (reverse iteration), so that
950  * for example an iterator at position std::string::rbegin() is less
951  * than an iterator at position std::string::rend().
952  */
953 inline bool operator<(const ReverseIterator& iter1, const ReverseIterator& iter2) {
954  return (iter1.base() > iter2.base());
955 }
956 
957 /**
958  * The comparison operators will not throw provided assigning a
959  * std::string::const_iterator object does not throw, as it will not
960  * in any sane implementation. Ordering is viewed from the
961  * perspective of the logical operation (reverse iteration), so that
962  * for example an iterator at position std::string::rbegin() is less
963  * than an iterator at position std::string::rend().
964  */
965 inline bool operator<=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
966  return (iter1.base() >= iter2.base());
967 }
968 
969 /**
970  * The comparison operators will not throw provided assigning a
971  * std::string::const_iterator object does not throw, as it will not
972  * in any sane implementation. Ordering is viewed from the
973  * perspective of the logical operation (reverse iteration), so that
974  * for example an iterator at position std::string::rbegin() is less
975  * than an iterator at position std::string::rend().
976  */
977 inline bool operator>(const ReverseIterator& iter1, const ReverseIterator& iter2) {
978  return (iter1.base() < iter2.base());
979 }
980 
981 /**
982  * The comparison operators will not throw provided assigning a
983  * std::string::const_iterator object does not throw, as it will not
984  * in any sane implementation. Ordering is viewed from the
985  * perspective of the logical operation (reverse iteration), so that
986  * for example an iterator at position std::string::rbegin() is less
987  * than an iterator at position std::string::rend().
988  */
989 inline bool operator>=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
990  return (iter1.base() <= iter2.base());
991 }
992 
993 /*** Iterator class methods which require ReverseIterator as a complete type ***/
994 
996  pos = iter.base();
997  return *this;
998 }
999 
1000 inline Iterator::Iterator(const ReverseIterator& iter): pos(iter.base()) {}
1001 
1002 } // namespace Utf8
1003 
1004 } // namespace Cgu
1005 
1006 #endif
Cgu::Utf8::ReverseIterator::reference
gunichar reference
Definition: convert.h:700
Cgu::Utf8::operator>=
bool operator>=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:643
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const std::string::reverse_iterator &iter)
Definition: convert.h:851
Cgu::Utf8::wide_to_utf8
std::string wide_to_utf8(const std::wstring &input)
Cgu::Utf8::ReverseIterator::base
std::string::const_iterator base() const
Definition: convert.h:821
Cgu
Definition: application.h:44
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const Iterator &iter)
Definition: convert.h:800
Cgu::Utf8::utf32_from_utf8
std::u32string utf32_from_utf8(const std::string &input)
Cgu::Utf8::Iterator::reference
gunichar reference
Definition: convert.h:394
Cgu::Utf8::ConversionError::~ConversionError
~ConversionError()
Definition: convert.h:97
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const std::string::reverse_iterator &iter)
Definition: convert.h:777
Cgu::Utf8::ConversionError::ConversionError
ConversionError(const char *msg)
Definition: convert.h:93
Cgu::Utf8::Iterator::Iterator
Iterator(const Iterator &iter)
Definition: convert.h:540
Cgu::Utf8::Iterator::Iterator
Iterator()
Definition: convert.h:556
Cgu::Utf8::ReverseIterator::difference_type
std::string::difference_type difference_type
Definition: convert.h:702
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const std::string::const_reverse_iterator &iter)
Definition: convert.h:836
Cgu::Utf8::validate
bool validate(const std::string &text)
Definition: convert.h:305
Cgu::Utf8::ReverseIterator
A class which will iterate in reverse through a std::string object by reference to unicode characters...
Definition: convert.h:697
Cgu::Utf8::locale_from_utf8
std::string locale_from_utf8(const std::string &input)
Cgu::Utf8::locale_to_utf8
std::string locale_to_utf8(const std::string &input)
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const std::string::const_reverse_iterator &iter)
Definition: convert.h:765
Cgu::Utf8::ReverseIterator::operator--
ReverseIterator & operator--()
Definition: convert.h:904
Cgu::Utf8::operator==
bool operator==(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:598
Cgu::Utf8::Iterator::pointer
void pointer
Definition: convert.h:395
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const std::string::const_iterator &iter)
Definition: convert.h:449
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const ReverseIterator &iter)
Definition: convert.h:860
Cgu::Utf8::ConversionError
Definition: convert.h:89
Cgu::Utf8::operator<
bool operator<(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:616
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const Iterator &iter)
Definition: convert.h:471
Cgu::Utf8::Iterator::difference_type
std::string::difference_type difference_type
Definition: convert.h:396
Cgu::Utf8::wide_from_utf8
std::wstring wide_from_utf8(const std::string &input)
Cgu::SharedHandle< gchar *, GFree >
Cgu::Utf8::Iterator::Iterator
Iterator(const std::string::const_iterator &iter)
Definition: convert.h:516
Cgu::Utf8::Iterator::operator*
Iterator::value_type operator*() const
Definition: convert.h:491
Cgu::Utf8::utf32_to_utf8
std::string utf32_to_utf8(const std::u32string &input)
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const ReverseIterator &iter)
Definition: convert.h:787
Cgu::Utf8::uniwide_to_utf8
std::string uniwide_to_utf8(const std::wstring &input)
Cgu::Utf8::ReverseIterator::operator*
ReverseIterator::value_type operator*() const
Definition: convert.h:920
CGU_GLIB_MEMORY_SLICES_FUNCS
#define CGU_GLIB_MEMORY_SLICES_FUNCS
Definition: cgu_config.h:84
shared_handle.h
Cgu::Utf8::ReverseIterator::iterator_category
std::bidirectional_iterator_tag iterator_category
Definition: convert.h:703
Cgu::Utf8::utf16_to_utf8
std::string utf16_to_utf8(const std::u16string &input)
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator()
Definition: convert.h:877
Cgu::Utf8::ReverseIterator::value_type
gunichar value_type
Definition: convert.h:699
Cgu::Utf8::Iterator::operator++
Iterator & operator++()
Definition: convert.h:563
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const Iterator &iter)
Definition: convert.h:872
Cgu::Utf8::operator>
bool operator>(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:634
Cgu::Utf8::Iterator::base
std::string::const_iterator base() const
Definition: convert.h:501
Cgu::Utf8::filename_to_utf8
std::string filename_to_utf8(const std::string &input)
Cgu::Utf8::operator<=
bool operator<=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:625
Cgu::Utf8::Iterator::value_type
gunichar value_type
Definition: convert.h:393
Cgu::Utf8::ConversionError::ConversionError
ConversionError(GError *error)
Definition: convert.h:95
Cgu::Utf8::filename_from_utf8
std::string filename_from_utf8(const std::string &input)
Cgu::Utf8::operator!=
bool operator!=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:607
Cgu::Utf8::uniwide_from_utf8
std::wstring uniwide_from_utf8(const std::string &input)
Cgu::Utf8::Iterator
A class which will iterate through a std::string object by reference to unicode characters rather tha...
Definition: convert.h:391
Cgu::Utf8::ConversionError::what
virtual const char * what() const
Definition: convert.h:92
Cgu::Utf8::Iterator::iterator_category
std::bidirectional_iterator_tag iterator_category
Definition: convert.h:397
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const std::string::iterator &iter)
Definition: convert.h:461
Cgu::Utf8::ReverseIterator::operator++
ReverseIterator & operator++()
Definition: convert.h:884
Cgu::Utf8::Iterator::Iterator
Iterator(const std::string::iterator &iter)
Definition: convert.h:531
Cgu::Utf8::ReverseIterator::pointer
void pointer
Definition: convert.h:701
Cgu::SharedHandle::get
T get() const
Definition: shared_handle.h:765
cgu_config.h
Cgu::Utf8::utf16_from_utf8
std::u16string utf16_from_utf8(const std::string &input)
Cgu::Utf8::Iterator::operator--
Iterator & operator--()
Definition: convert.h:578