c++-gtk-utils
convert.h
Go to the documentation of this file.
1 /* Copyright (C) 2005 to 2013 Chris Vine
2 
3 The library comprised in this file or of which this file is part is
4 distributed by Chris Vine under the GNU Lesser General Public
5 License as follows:
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public License
9  as published by the Free Software Foundation; either version 2.1 of
10  the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful, but
13  WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License, version 2.1, for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License, version 2.1, along with this library (see the file LGPL.TXT
19  which came with this source code package in the src/utils sub-directory);
20  if not, write to the Free Software Foundation, Inc.,
21  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 
23 However, it is not intended that the object code of a program whose
24 source code instantiates a template from this file or uses macros or
25 inline functions (of any length) should by reason only of that
26 instantiation or use be subject to the restrictions of use in the GNU
27 Lesser General Public License. With that in mind, the words "and
28 macros, inline functions and instantiations of templates (of any
29 length)" shall be treated as substituted for the words "and small
30 macros and small inline functions (ten lines or less in length)" in
31 the fourth paragraph of section 5 of that licence. This does not
32 affect any other reason why object code may be subject to the
33 restrictions in that licence (nor for the avoidance of doubt does it
34 affect the application of section 2 of that licence to modifications
35 of the source code in this file).
36 
37 */
38 
39 #ifndef CGU_CONVERT_H
40 #define CGU_CONVERT_H
41 
42 #include <string>
43 #include <iterator>
44 #include <exception>
45 
46 #include <glib.h>
47 
49 
50 namespace Cgu {
51 
52 /**
53  * @file convert.h
54  * @brief This file contains functions for converting between
55  * character sets.
56  *
57  * \#include <c++-gtk-utils/convert.h>
58  *
59  * This file contains functions for converting between character sets.
60  * If you want these functions to work, you will generally have needed
61  * to have set the locale in the relevant program with either
62  * <em>std::locale::global(std::locale(""))</em> (from the C++
63  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C
64  * standard library).
65  */
66 
67 /**
68  * @namespace Cgu::Utf8
69  * @brief This namespace contains utilities relevant to the use of
70  * UTF-8 in programs.
71  *
72  * \#include <c++-gtk-utils/convert.h> (for conversion and validation
73  * functions)
74  *
75  * \#include <c++-gtk-utils/reassembler.h> (for Reassembler class)
76  * @sa convert.h reassembler.h
77  *
78  * This namespace contains utilities relevant to the use of UTF-8 in
79  * programs. If you want these functions to work, you will generally
80  * have needed to have set the locale in the relevant program with
81  * either <em>std::locale::global(std::locale(""))</em> (from the C++
82  * standard library) or <em>setlocale(LC_ALL,"")</em> (from the C standard
83  * library).
84  */
85 
86 namespace Utf8 {
87 
88 class ConversionError: public std::exception {
89  gchar* message;
90 public:
91  virtual const char* what() const throw() {return (const char*)message;}
92 
94  g_free(message);
95  message = g_strdup(e.message);
96  return *this;
97  }
98  ConversionError(const ConversionError& e) throw(): message(g_strdup(e.message)) {}
99  ConversionError(const char* msg) throw():
100  message(g_strdup_printf("Utf8::ConversionError: %s", msg)) {}
101  ConversionError(GError* error) throw():
102  message(g_strdup_printf("Utf8::ConversionError: %s", error->message)) {}
103  ~ConversionError() throw() {g_free(message);}
104 };
105 
106 /**
107  * Converts text from UTF-8 to the system's Unicode wide character
108  * representation, which will be UCS-4/UTF-32 for systems with a wide
109  * character size of 4 (almost all unix-like systems), and UTF-16 for
110  * systems with a wide character size of 2.
111  * @param input Text in valid UTF-8 format.
112  * @return The input text converted to UCS-4 or UTF-16.
113  * @exception Cgu::Utf8::ConversionError This exception will be thrown
114  * if conversion fails because the input string is not in valid UTF-8
115  * format or the system does not support wide character Unicode
116  * strings.
117  * @exception std::bad_alloc This function might throw std::bad_alloc
118  * if memory is exhausted and the system throws in that case.
119  *
120  * Since 0.9.2
121  */
122 std::wstring uniwide_from_utf8(const std::string& input);
123 
124 /**
125  * Converts text from the system's Unicode wide character
126  * representation, which will be UCS-4/UTF-32 for systems with a wide
127  * character size of 4 (almost all unix-like systems) and UTF-16 for
128  * systems with a wide character size of 2, to narrow character UTF-8
129  * format.
130  * @param input Text in valid UCS-4 or UTF-16 format.
131  * @return The input text converted to UTF-8.
132  * @exception Cgu::Utf8::ConversionError This exception will be thrown
133  * if conversion fails because the input string is not in valid UCS-4
134  * or UTF-16 format or the system does not support wide character
135  * Unicode strings.
136  * @exception std::bad_alloc This function might throw std::bad_alloc
137  * if memory is exhausted and the system throws in that case.
138  *
139  * Since 0.9.2
140  */
141 std::string uniwide_to_utf8(const std::wstring& input);
142 
143 
144 /**
145  * Converts text from UTF-8 to the system's wide character locale
146  * representation. For this function to work correctly, the system's
147  * installed iconv() must support conversion to a generic wchar_t
148  * target, but in POSIX whether it does so is implementation defined
149  * (GNU's C library implementation does). For most unix-like systems
150  * the wide character representation will be Unicode (UCS-4/UTF-32 or
151  * UTF-16), and where that is the case use the uniwide_from_utf8()
152  * function instead, which will not rely on the generic target being
153  * available.
154  * @param input Text in valid UTF-8 format.
155  * @return The input text converted to the system's wide character
156  * locale representation.
157  * @exception Cgu::Utf8::ConversionError This exception will be thrown
158  * if conversion fails because the input string is not in valid UTF-8
159  * format, or cannot be converted to the system's wide character
160  * locale representation (eg because the input characters cannot be
161  * represented by that encoding, or the system's installed iconv()
162  * function does not support conversion to a generic wchar_t target).
163  * @exception std::bad_alloc This function might throw std::bad_alloc
164  * if memory is exhausted and the system throws in that case.
165  *
166  * Since 0.9.2
167  */
168 std::wstring wide_from_utf8(const std::string& input);
169 
170 
171 /**
172  * Converts text from the system's wide character locale
173  * representation to UTF-8. For this function to work correctly, the
174  * system's installed iconv() must support conversion from a generic
175  * wchar_t target, but in POSIX whether it does so is implementation
176  * defined (GNU's C library implementation does). For most unix-like
177  * systems the wide character representation will be Unicode
178  * (UCS-4/UTF-32 or UTF-16), and where that is the case use the
179  * uniwide_to_utf8() function instead, which will not rely on the
180  * generic target being available.
181  * @param input Text in a valid wide character locale format.
182  * @return The input text converted to UTF-8.
183  * @exception Cgu::Utf8::ConversionError This exception will be thrown
184  * if conversion fails because the input string is not in a valid wide
185  * character locale format, or cannot be converted to UTF-8 (eg
186  * because the system's installed iconv() function does not support
187  * conversion from a generic wchar_t target).
188  * @exception std::bad_alloc This function might throw std::bad_alloc
189  * if memory is exhausted and the system throws in that case.
190  *
191  * Since 0.9.2
192  */
193 std::string wide_to_utf8(const std::wstring& input);
194 
195 /**
196  * Converts text from UTF-8 to the system's filename encoding.
197  * @param input Text in valid UTF-8 format.
198  * @return The input text converted to filename encoding.
199  * @exception Cgu::Utf8::ConversionError This exception will be thrown
200  * if conversion fails because the input string is not in valid UTF-8
201  * format, or cannot be converted to filename encoding (eg because the
202  * input characters cannot be represented by that encoding).
203  * @exception std::bad_alloc This function might throw std::bad_alloc
204  * if memory is exhausted and the system throws in that case.
205  * @note glib takes the system's filename encoding from the
206  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
207  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
208  * set, it will be assumed that the filename encoding is the same as
209  * the locale encoding. If G_FILENAME_ENCODING is set, then
210  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
211  * the value held by G_FILENAME_ENCODING.
212  *
213  * Since 0.9.2
214  */
215 std::string filename_from_utf8(const std::string& input);
216 
217 /**
218  * Converts text from the system's filename encoding to UTF-8.
219  * @param input Text in valid filename encoding.
220  * @return The input text converted to UTF-8.
221  * @exception Cgu::Utf8::ConversionError This exception will be thrown
222  * if conversion fails because the input string is not in valid
223  * filename encoding.
224  * @exception std::bad_alloc This function might throw std::bad_alloc
225  * if memory is exhausted and the system throws in that case.
226  * @note glib takes the system's filename encoding from the
227  * environmental variables G_FILENAME_ENCODING and G_BROKEN_FILENAMES.
228  * If G_BROKEN_FILENAMES is set to 1 and G_FILENAME_ENCODING is not
229  * set, it will be assumed that the filename encoding is the same as
230  * the locale encoding. If G_FILENAME_ENCODING is set, then
231  * G_BROKEN_FILENAMES is ignored, and filename encoding is taken from
232  * the value held by G_FILENAME_ENCODING.
233  *
234  * Since 0.9.2
235  */
236 std::string filename_to_utf8(const std::string& input);
237 
238 /**
239  * Converts text from UTF-8 to the system's locale encoding.
240  * @param input Text in valid UTF-8 format.
241  * @return The input text converted to locale encoding.
242  * @exception Cgu::Utf8::ConversionError This exception will be thrown
243  * if conversion fails because the input string is not in valid UTF-8
244  * format, or cannot be converted to locale encoding (eg because the
245  * input characters cannot be represented by that encoding).
246  * @exception std::bad_alloc This function might throw std::bad_alloc
247  * if memory is exhausted and the system throws in that case.
248  *
249  * Since 0.9.2
250  */
251 std::string locale_from_utf8(const std::string& input);
252 
253 /**
254  * Converts text from the system's locale encoding to UTF-8.
255  * @param input Text in valid locale encoding.
256  * @return The input text converted to UTF-8.
257  * @exception Cgu::Utf8::ConversionError This exception will be thrown
258  * if conversion fails because the input string is not in valid locale
259  * encoding.
260  * @exception std::bad_alloc This function might throw std::bad_alloc
261  * if memory is exhausted and the system throws in that case.
262  *
263  * Since 0.9.2
264  */
265 std::string locale_to_utf8(const std::string& input);
266 
267 /**
268  * Indicates whether the input text comprises valid UTF-8.
269  * @param text The text to be tested.
270  * @return true if the input text is in valid UTF-8 format, otherwise
271  * false.
272  * @exception std::bad_alloc This function might throw std::bad_alloc
273  * if std::string::data() might throw when memory is exhausted.
274  * @note \#include <c++-gtk-utils/convert.h> for this function.
275  *
276  * Since 0.9.2
277  */
278 inline bool validate(const std::string& text) {
279  return g_utf8_validate(text.data(), text.size(), 0);
280 }
281 
282 /************** Iterator class **************/
283 
284 /**
285  * @class Iterator convert.h c++-gtk-utils/convert.h
286  * @brief A class which will iterate through a std::string object by
287  * reference to unicode characters rather than by bytes.
288  * @sa Cgu::Utf8::ReverseIterator
289  *
290  * The Cgu::Utf8::Iterator class does the same as
291  * std::string::const_iterator, except that when iterating through a
292  * std::string object using the ++ and - - postfix and prefix
293  * operators, it iterates by increments of whole unicode code points
294  * rather than by reference to bytes. In addition, the dereferencing
295  * operator returns the whole unicode code point (a UCS-4 gunichar
296  * type) rather than a char type.
297  *
298  * Where, as in practically all unix-like systems, sizeof(wchar_t) ==
299  * 4, then the gunichar return value of the dereferencing operator can
300  * be converted by a simple static_cast to the wchar_t type. So far
301  * as displaying individual code points is concerned however, it
302  * should be noted that because unicode allows combining characters, a
303  * unicode code point may not contain the whole representation of a
304  * character as displayed. This effect can be dealt with for all
305  * characters capable of representation by Level 1 unicode (ie by
306  * precomposed characters) using g_utf8_normalize() before iterating.
307  * There will still however be some non-European scripts, in
308  * particular some Chinese/Japanese/Korean ideograms, where
309  * description of the ideogram requires more than one code point to be
310  * finally resolved. For these, printing individual code points
311  * sequentially one by one directly to a display (say with std::wcout)
312  * may or may not have the desired result, depending on how the
313  * display device (eg console) deals with that case.
314  *
315  * A Cgu::Utf8::Iterator only allows reading from and not writing to
316  * the std::string object being iterated through. This is because in
317  * UTF-8 the representation of any one unicode code point will require
318  * between 1 and 6 bytes: accordingly modifying a UTF-8 string may
319  * change its length (in bytes) even though the number of unicode
320  * characters stays the same. For the same reason, this iterator is a
321  * bidirectional iterator but not a random access iterator.
322  *
323  * The std::string object concerned should contain valid UTF-8 text.
324  * If necessary, this should be checked with Cgu::Utf8::validate()
325  * first. In addition, before use, the Cgu::Utf8::Iterator object
326  * must be initialized by a std::string::const_iterator or
327  * std::string::iterator object pointing to the first byte of a valid
328  * UTF-8 character in the string (or by another Cgu::Utf8::Iterator
329  * object or by a Cgu::Utf8::ReverseIterator object), and iteration
330  * will begin at the point of initialization: therefore, assuming the
331  * string contains valid UTF-8 text, passing std::string::begin() to a
332  * Cgu::Utf8::Iterator object will always be safe. Initialization by
333  * std::string::end() is also valid if the first iteration is
334  * backwards with the \-- operator. This initialization can be done
335  * either in the constructor or by assignment. Comparison operators
336  * ==, !=, <, <=, > and >= are provided enabling the position of
337  * Cgu::Utf8::Iterator objects to be compared with each other or with
338  * std::string::const_iterator and std::string::iterator objects.
339  *
340  * This is an example:
341  * @code
342  * using namespace Cgu;
343  *
344  * std::wstring wide_str(L"ßøǿón");
345  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
346  *
347  * Utf8::Iterator iter;
348  * for (iter = narrow_str.begin();
349  * iter != narrow_str.end();
350  * ++iter)
351  * std::wcout << static_cast<wchar_t>(*iter) << std::endl;
352  * @endcode
353  *
354  * This class assumes in using g_utf8_next_char(), g_utf8_prev_char()
355  * and g_utf8_get_char() that the std::string object keeps its
356  * internal string in contiguous storage. This is required by the
357  * C++11 standard, but not formally by C++98/C++03. However, known
358  * implementations of std::string in fact store the string
359  * contiguously.
360  */
361 
362 class ReverseIterator;
363 
364 class Iterator {
365 public:
366  typedef gunichar value_type;
367  typedef gunichar reference; // read only
368  typedef void pointer; // read only
369  typedef std::string::difference_type difference_type;
370  typedef std::bidirectional_iterator_tag iterator_category;
371 
372 private:
373  std::string::const_iterator pos;
374 public:
375 
376 /**
377  * Increments the iterator so that it moves from the beginning of the
378  * current UTF-8 character to the beginning of the next UTF-8
379  * character. It is a prefix operator. It will not throw.
380  * @return A reference to the iterator in its new position.
381  *
382  * Since 1.0.1
383  */
384  Iterator& operator++();
385 
386 /**
387  * Increments the iterator so that it moves from the beginning of the
388  * current UTF-8 character to the beginning of the next UTF-8
389  * character. It is a postfix operator. It will not throw provided
390  * that copy constructing and assigning a std::string::const_iterator
391  * object does not throw, as it will not in any sane implementation.
392  * @return A copy of the iterator in its former position.
393  *
394  * Since 1.0.1
395  */
396  Iterator operator++(int);
397 
398 /**
399  * Decrements the iterator so that it moves from the beginning of the
400  * current UTF-8 character to the beginning of the previous UTF-8
401  * character. It is a prefix operator. It will not throw.
402  * @return A reference to the iterator in its new position.
403  *
404  * Since 1.0.1
405  */
406  Iterator& operator--();
407 
408 /**
409  * Decrements the iterator so that it moves from the beginning of the
410  * current UTF-8 character to the beginning of the previous UTF-8
411  * character. It is a postfix operator. It will not throw provided
412  * that copy constructing and assigning a std::string::const_iterator
413  * object does not throw, as it will not in any sane implementation.
414  * @return A copy of the iterator in its former position.
415  *
416  * Since 1.0.1
417  */
418  Iterator operator--(int);
419 
420 /**
421  * Assigns a std::string::const_iterator object to this object. It
422  * should point to the beginning of a UTF-8 character (eg
423  * std::string::begin()) or to std::string::end(). It will not throw
424  * provided assigning a std::string::const_iterator object does not
425  * throw, as it will not in any sane implementation.
426  * @param iter The std::string::const_iterator.
427  * @return A reference to this Cgu::Utf8::Iterator object after
428  * assignment.
429  *
430  * Since 1.0.1
431  */
432  Iterator& operator=(const std::string::const_iterator& iter) {pos = iter; return *this;}
433 
434 /**
435  * Assigns a std::string::iterator object to this object. It should
436  * point to the beginning of a UTF-8 character (eg
437  * std::string::begin()) or to std::string::end(). It will not throw
438  * provided assigning a std::string::const_iterator object does not
439  * throw, as it will not in any sane implementation.
440  * @param iter The std::string::iterator.
441  * @return A reference to this Cgu::Utf8::Iterator object after
442  * assignment.
443  *
444  * Since 1.0.1
445  */
446  Iterator& operator=(const std::string::iterator& iter) {pos = iter; return *this;}
447 
448 /**
449  * Assigns a Cgu::Utf8::Iterator object to this object. It will not
450  * throw provided assigning a std::string::const_iterator object does
451  * not throw, as it will not in any sane implementation.
452  * @param iter The iterator.
453  * @return A reference to this Cgu::Utf8::Iterator object after
454  * assignment.
455  *
456  * Since 1.0.1
457  */
458  Iterator& operator=(const Iterator& iter) {pos = iter.pos; return *this;}
459 
460 /**
461  * Assigns a Cgu::Utf8::ReverseIterator object to this object, so that
462  * this iterator adopts the same physical position (but the logical
463  * position will be offset to the following UTF-8 character). It will
464  * not throw provided assigning a std::string::const_iterator object
465  * does not throw, as it will not in any sane implementation.
466  * @param iter The iterator.
467  * @return A reference to this Cgu::Utf8::Iterator object after
468  * assignment.
469  *
470  * Since 1.0.1
471  */
472  Iterator& operator=(const ReverseIterator& iter);
473 
474 /**
475  * The dereference operator.
476  * @return A 32-bit gunichar object containing the whole unicode code
477  * point which is currently represented by this iterator. It will not
478  * throw.
479  *
480  * Since 1.0.1
481  */
482  Iterator::value_type operator*() const {return g_utf8_get_char(&(*pos));}
483 
484 /**
485  * @return The current underlying std::string::const_iterator kept by
486  * this iterator. Once this iterator has been correctly initialized,
487  * that will point to the beginning of the UTF-8 character currently
488  * represented by this iterator or to std::string::end(). It will not
489  * throw provided assigning a std::string::const_iterator object does
490  * not throw, as it will not in any sane implementation.
491  *
492  * Since 1.0.1
493  */
494  std::string::const_iterator base() const {return pos;}
495 
496 /**
497  * Constructs this iterator and initialises it with a
498  * std::string::const_iterator object. It should point to the
499  * beginning of a UTF-8 character (eg std::string::begin()) or to
500  * std::string::end(). It will not throw provided that copy
501  * constructing a std::string::const_iterator object does not throw,
502  * as it will not in any sane implementation. This is a type
503  * conversion constructor (it is not marked explicit) so that it can
504  * be used with Cgu::Utf8::Iterator comparison operators to compare
505  * the position of Cgu::Utf8::Iterator with
506  * std::string::const_iterator objects.
507  * @param iter The std::string::const_iterator.
508  *
509  * Since 1.0.1
510  */
511  Iterator(const std::string::const_iterator& iter): pos(iter) {}
512 
513 /**
514  * Constructs this iterator and initialises it with a
515  * std::string::iterator object. It should point to the beginning of
516  * a UTF-8 character (eg std::string::begin()) or to
517  * std::string::end(). It will not throw provided that copy
518  * constructing a std::string::const_iterator object does not throw,
519  * as it will not in any sane implementation. This is a type
520  * conversion constructor (it is not marked explicit) so that it can
521  * be used with Cgu::Utf8::Iterator comparison operators to compare
522  * the position of Cgu::Utf8::Iterator with std::string::iterator
523  * objects.
524  * @param iter The std::string::iterator.
525  *
526  * Since 1.0.1
527  */
528  Iterator(const std::string::iterator& iter): pos(iter) {}
529 
530 /**
531  * Constructs this iterator and initialises it with another
532  * Cgu::Utf8::Iterator object. It will not throw provided that copy
533  * constructing a std::string::const_iterator object does not throw,
534  * as it will not in any sane implementation.
535  * @param iter The iterator.
536  *
537  * Since 1.0.1
538  */
539  Iterator(const Iterator& iter): pos(iter.pos) {}
540 
541 /**
542  * Constructs this iterator and initialises it with a
543  * Cgu::Utf8::ReverseIterator object, so that this iterator adopts the
544  * same physical position (but the logical position will be offset to
545  * the following UTF-8 character). It will not throw provided that
546  * copy constructing a std::string::const_iterator object does not
547  * throw, as it will not in any sane implementation.
548  * @param iter The iterator.
549  *
550  * Since 1.0.1
551  */
552  explicit Iterator(const ReverseIterator& iter);
553 
554 /**
555  * The default constructor will not throw.
556  *
557  * Since 1.0.1
558  */
559  Iterator() {}
560 
561 /* Only has effect if --with-glib-memory-slices-compat or
562  * --with-glib-memory-slices-no-compat option picked */
564 };
565 
567  const std::string::value_type* tmp = &(*pos);
568  // using g_utf8_next_char is safe even when pos points to the last character -
569  // that macro calls up the g_utf8_skip look-up table rather than attempting to
570  // read the following character, so we can safely iterate to std::string::end()
571  pos += g_utf8_next_char(tmp) - tmp;
572  return *this;
573 }
574 
576  Iterator tmp(*this);
577  ++(*this);
578  return tmp;
579 }
580 
582  // we might be iterating from std::string::end() so we need
583  // to decrement before dereferencing and then increment again
584  const std::string::value_type* tmp = &(*(pos-1));
585  ++tmp;
586  pos -= tmp - g_utf8_prev_char(tmp);
587  return *this;
588 }
589 
591  Iterator tmp(*this);
592  --(*this);
593  return tmp;
594 }
595 
596 /**
597  * The comparison operators will not throw provided assigning a
598  * std::string::const_iterator object does not throw, as it will not
599  * in any sane implementation.
600  *
601  * Since 1.0.1
602  */
603 inline bool operator==(const Iterator& iter1, const Iterator& iter2) {
604  return (iter1.base() == iter2.base());
605 }
606 
607 /**
608  * The comparison operators will not throw provided assigning a
609  * std::string::const_iterator object does not throw, as it will not
610  * in any sane implementation.
611  *
612  * Since 1.0.1
613  */
614 inline bool operator!=(const Iterator& iter1, const Iterator& iter2) {
615  return (iter1.base() != iter2.base());
616 }
617 
618 /**
619  * The comparison operators will not throw provided assigning a
620  * std::string::const_iterator object does not throw, as it will not
621  * in any sane implementation.
622  *
623  * Since 1.0.1
624  */
625 inline bool operator<(const Iterator& iter1, const Iterator& iter2) {
626  return (iter1.base() < iter2.base());
627 }
628 
629 /**
630  * The comparison operators will not throw provided assigning a
631  * std::string::const_iterator object does not throw, as it will not
632  * in any sane implementation.
633  *
634  * Since 1.0.1
635  */
636 inline bool operator<=(const Iterator& iter1, const Iterator& iter2) {
637  return (iter1.base() <= iter2.base());
638 }
639 
640 /**
641  * The comparison operators will not throw provided assigning a
642  * std::string::const_iterator object does not throw, as it will not
643  * in any sane implementation.
644  *
645  * Since 1.0.1
646  */
647 inline bool operator>(const Iterator& iter1, const Iterator& iter2) {
648  return (iter1.base() > iter2.base());
649 }
650 
651 /**
652  * The comparison operators will not throw provided assigning a
653  * std::string::const_iterator object does not throw, as it will not
654  * in any sane implementation.
655  *
656  * Since 1.0.1
657  */
658 inline bool operator>=(const Iterator& iter1, const Iterator& iter2) {
659  return (iter1.base() >= iter2.base());
660 }
661 
662 /************** ReverseIterator class **************/
663 
664 /**
665  * @class ReverseIterator convert.h c++-gtk-utils/convert.h
666  * @brief A class which will iterate in reverse through a std::string
667  * object by reference to unicode characters rather than by bytes.
668  * @sa Cgu::Utf8::Iterator
669  *
670  * The Cgu::Utf8::ReverseIterator class does the same as
671  * std::string::const_reverse_iterator, except that when iterating
672  * through a std::string object using the ++ and - - postfix and
673  * prefix operators, it iterates by increments of whole unicode code
674  * points rather than by reference to bytes. In addition, the
675  * dereferencing operator returns the whole unicode code point (a
676  * UCS-4 gunichar type) rather than a char type.
677  *
678  * Before use, the Cgu::Utf8::ReverseIterator object must be
679  * initialized by a std::string::const_reverse_iterator or
680  * std::string::reverse_iterator object representing the first byte of
681  * a valid UTF-8 character in the string (or by another
682  * Cgu::Utf8::ReverseIterator object or by a Cgu::Utf8::Iterator
683  * object): so assuming the string contains valid UTF-8 text, it is
684  * always valid to initialise a Cgu::Utf8::ReverseIterator with
685  * std::string::rbegin(). Initialization by std::string::rend() is
686  * also valid if the first iteration is backwards with the \--
687  * operator. This initialization can be done either in the
688  * constructor or by assignment. Comparison operators ==, !=, <, <=,
689  * > and >= are provided enabling the position of
690  * Cgu::Utf8::ReverseIterator objects to be compared with each other
691  * or with std::string::const_reverse_iterator and
692  * std::string::reverse_iterator objects.
693  *
694  * This is an example:
695  * @code
696  * using namespace Cgu;
697  *
698  * std::wstring wide_str(L"ßøǿón");
699  * std::string narrow_str(Utf8::uniwide_to_utf8(wide_str));
700  *
701  * Utf8::ReverseIterator iter;
702  * for (iter = narrow_str.rbegin();
703  * iter != narrow_str.rend();
704  * ++iter)
705  * std::wcout << static_cast<wchar_t>(*iter) << std::endl;
706  * @endcode
707  *
708  * For further information on its use, see the Utf8::Iterator
709  * documentation.
710  */
711 
713 public:
714  typedef gunichar value_type;
715  typedef gunichar reference; // read only
716  typedef void pointer; // read only
717  typedef std::string::difference_type difference_type;
718  typedef std::bidirectional_iterator_tag iterator_category;
719 
720 private:
721  std::string::const_iterator pos;
722  // we use cache to make iterating and then dereferencing more efficient
723  mutable std::string::const_iterator cache;
724 public:
725 
726 /**
727  * Increments the iterator in the reverse direction so that it moves
728  * from the beginning of the current UTF-8 character to the beginning
729  * of the previous UTF-8 character in the std::string object
730  * concerned. It is a prefix operator. It will not throw provided
731  * assigning a std::string::const_iterator object does not throw, as
732  * it will not in any sane implementation.
733  * @return A reference to the iterator in its new position
734  *
735  * Since 1.0.1
736  */
738 
739 /**
740  * Increments the iterator in the reverse direction so that it moves
741  * from the beginning of the current UTF-8 character to the beginning
742  * of the previous UTF-8 character in the std::string object
743  * concerned. It is a postfix operator. It will not throw provided
744  * that copy constructing and assigning a std::string::const_iterator
745  * object does not throw, as it will not in any sane implementation.
746  * @return A copy of the iterator in its former position
747  *
748  * Since 1.0.1
749  */
751 
752 /**
753  * Decrements the iterator in the reverse direction so that it moves
754  * from the beginning of the current UTF-8 character to the beginning
755  * of the following UTF-8 character in the std::string object
756  * concerned. It is a prefix operator. It will not throw provided
757  * assigning a std::string::const_iterator object does not throw, as
758  * it will not in any sane implementation.
759  * @return A reference to the iterator in its new position
760  *
761  * Since 1.0.1
762  */
764 
765 /**
766  * Decrements the iterator in the reverse direction so that it moves
767  * from the beginning of the current UTF-8 character to the beginning
768  * of the following UTF-8 character in the std::string object
769  * concerned. It is a postfix operator. It will not throw provided
770  * that copy constructing and assigning a std::string::const_iterator
771  * object does not throw, as it will not in any sane implementation.
772  * @return A copy of the iterator in its former position
773  *
774  * Since 1.0.1
775  */
777 
778 /**
779  * Assigns a std::string::const_reverse_iterator object to this
780  * object. It should represent the beginning of a UTF-8 character (eg
781  * std::string::rbegin()) or comprise std::string::rend(). It will
782  * not throw provided assigning a std::string::const_iterator object
783  * does not throw, as it will not in any sane implementation.
784  * @param iter The const_reverse_iterator.
785  * @return A reference to this Cgu::Utf8::ReverseIterator object after
786  * assignment.
787  *
788  * Since 1.0.1
789  */
790  ReverseIterator& operator=(const std::string::const_reverse_iterator& iter) {cache = pos = iter.base(); return *this;} // pos and cache both restart at iter.base()
791 
792 /**
793  * Assigns a std::string::reverse_iterator object to this object. It
794  * should represent the beginning of a UTF-8 character (eg
795  * std::string::rbegin()) or comprise std::string::rend(). It will
796  * not throw provided assigning a std::string::const_iterator object
797  * does not throw, as it will not in any sane implementation.
798  * @param iter The reverse_iterator.
799  * @return A reference to this Cgu::Utf8::ReverseIterator object after
800  * assignment.
801  *
802  * Since 1.0.1
803  */
804  ReverseIterator& operator=(const std::string::reverse_iterator& iter) {cache = pos = iter.base(); return *this;} // pos and cache both restart at iter.base()
805 
806 /**
807  * Assigns a Cgu::Utf8::ReverseIterator object to this object. It
808  * will not throw provided assigning a std::string::const_iterator
809  * object does not throw, as it will not in any sane implementation.
810  * @param iter The iterator.
811  * @return A reference to this Cgu::Utf8::ReverseIterator object after
812  * assignment.
813  *
814  * Since 1.0.1
815  */
816  ReverseIterator& operator=(const ReverseIterator& iter) {cache = iter.cache; pos = iter.pos; return *this;} // member-wise copy, cached position included
817 
818 /**
819  * Assigns a Cgu::Utf8::Iterator object to this object, so that this
820  * iterator adopts the same physical position (but the logical
821  * position will be offset to the previous UTF-8 character in the
822  * std::string object concerned). It will not throw provided
823  * assigning a std::string::const_iterator object does not throw, as
824  * it will not in any sane implementation.
825  * @param iter The iterator.
826  * @return A reference to this Cgu::Utf8::ReverseIterator object after
827  * assignment.
828  *
829  * Since 1.0.1
830  */
831  ReverseIterator& operator=(const Iterator& iter) {cache = pos = iter.base(); return *this;} // pos and cache both restart at iter.base()
832 
833 /**
834  * The dereference operator. Note that although this method is const,
835  * it is not thread safe for concurrent reads without external
836  * synchronization because it writes to an internal cache.
837  * @return A 32-bit gunichar object containing the whole unicode code
838  * point which is currently represented by this iterator. It will not
839  * throw.
840  *
841  * Since 1.0.1
842  */
844 
845 /**
846  * @return The current underlying std::string::const_iterator kept by
847  * this iterator. Once this iterator has been correctly initialized,
848  * that will point to the beginning of the UTF-8 character after the
849  * one currently represented by this iterator or to
850  * std::string::end(). It will not throw provided copy constructing
851  * a std::string::const_iterator object does not throw, as it will
852  * not in any sane implementation.
853  *
854  * Since 1.0.1
855  */
856  std::string::const_iterator base() const {return pos;} // returns a copy of pos; the cache member is not exposed
857 
858 /**
859  * Constructs this iterator and initialises it with a
860  * std::string::const_reverse_iterator object. It should represent
861  * the beginning of a UTF-8 character (eg std::string::rbegin()) or
862  * comprise std::string::rend(). It will not throw provided that copy
863  * constructing a std::string::const_iterator object does not throw,
864  * as it will not in any sane implementation. This is a type
865  * conversion constructor (it is not marked explicit) so that it can
866  * be used with Cgu::Utf8::ReverseIterator comparison operators to
867  * compare the position of Cgu::Utf8::ReverseIterator with
868  * std::string::const_reverse_iterator objects.
869  * @param iter The const_reverse_iterator.
870  *
871  * Since 1.0.1
872  */
873  ReverseIterator(const std::string::const_reverse_iterator& iter): pos(iter.base()), cache(iter.base()) {} // both members start at iter.base()
874 
875 /**
876  * Constructs this iterator and initialises it with a
877  * std::string::reverse_iterator object. It should represent the
878  * beginning of a UTF-8 character (eg std::string::rbegin()) or
879  * comprise std::string::rend(). It will not throw provided that copy
880  * constructing a std::string::const_iterator object does not throw,
881  * as it will not in any sane implementation. This is a type
882  * conversion constructor (it is not marked explicit) so that it can
883  * be used with Cgu::Utf8::ReverseIterator comparison operators to
884  * compare the position of Cgu::Utf8::ReverseIterator with
885  * std::string::reverse_iterator objects.
886  * @param iter The reverse_iterator.
887  *
888  * Since 1.0.1
889  */
890  ReverseIterator(const std::string::reverse_iterator& iter): pos(iter.base()), cache(iter.base()) {} // both members start at iter.base()
891 
892 /**
893  * Constructs this iterator and initialises it with another
894  * Cgu::Utf8::ReverseIterator object. It will not throw provided that
895  * copy constructing a std::string::const_iterator object does not
896  * throw, as it will not in any sane implementation.
897  * @param iter The iterator.
898  *
899  * Since 1.0.1
900  */
901  ReverseIterator(const ReverseIterator& iter): pos(iter.pos), cache(iter.cache) {} // member-wise copy; the source's cached position is carried over too
902 
903 /**
904  * Constructs this iterator and initialises it with a
905  * Cgu::Utf8::Iterator object, so that this iterator adopts the same
906  * physical position (but the logical position will be offset to the
907  * previous UTF-8 character in the std::string object concerned). It
908  * will not throw provided that copy constructing a
909  * std::string::const_iterator object does not throw, as it will not
910  * in any sane implementation.
911  * @param iter The iterator.
912  *
913  * Since 1.0.1
914  */
915  explicit ReverseIterator(const Iterator& iter): pos(iter.base()), cache(iter.base()) {} // both members start at iter.base()
916 
917 /**
918  * The default constructor will not throw.
919  *
920  * Since 1.0.1
921  */
923 
924 /* Only has effect if --with-glib-memory-slices-compat or
925  * --with-glib-memory-slices-no-compat option picked */
927 };
928 
930 
931  if (pos > cache) pos = cache; // fast path: a cache lying before pos was stored by operator*() or operator--() as the position of the preceding character
932 
933  else {
934  // pos may be std::string::end() (ie the base of std::string::rbegin()), which must
935  // not be dereferenced, so we take the address of the byte before pos and step forward again
936  const std::string::value_type* tmp = &(*(pos-1));
937  ++tmp; // tmp now addresses the same byte position as pos
938  pos -= tmp - g_utf8_prev_char(tmp); // move pos back by the distance to the start of the preceding UTF-8 character
939  }
940  return *this;
941 }
942 
944  ReverseIterator tmp(*this); // snapshot of the current state (pos and cache) to hand back
945  ++(*this); // delegate the actual move to the prefix operator
946  return tmp; // caller receives the iterator as it was before the move
947 }
948 
950  cache = pos; // remember the old position: once pos has advanced below, this is where a following operator++() returns to
951  const std::string::value_type* tmp = &(*pos);
952  // using g_utf8_next_char is safe even when pos points to the last character in the
953  // string (the first in reverse order) - that macro calls up the g_utf8_skip look-up table rather than
954  // attempting to read the following character, so we can safely iterate to std::string::rbegin()
955  pos += g_utf8_next_char(tmp) - tmp; // advance pos by the byte length of the character that starts at the old pos
956  return *this;
957 }
958 
960  ReverseIterator tmp(*this); // snapshot of the current state (pos and cache) to hand back
961  --(*this); // delegate the actual move to the prefix operator
962  return tmp; // caller receives the iterator as it was before the move
963 }
964 
966  Iterator tmp(*this); // forward iterator placed at this iterator's physical position (base())
967  --tmp; // Iterator::operator--() is defined outside this view; presumably it steps to the start of the preceding character
968  cache = tmp.base(); // record that position so that a following operator++() can jump straight to it
969  return g_utf8_get_char(&(*(tmp.base()))); // decode the code point starting at the recorded position
970 }
971 
972 /**
973  * Two ReverseIterator objects compare equal when their base()
974  * positions are equal. It will not throw provided that copying and
975  * comparing std::string::const_iterator objects does not throw.
976  *
977  * Since 1.0.1
978  */
979 inline bool operator==(const ReverseIterator& iter1, const ReverseIterator& iter2) {
980  return iter1.base() == iter2.base();
981 }
982 
983 /**
984  * Two ReverseIterator objects compare unequal when their base()
985  * positions differ. It will not throw provided that copying and
986  * comparing std::string::const_iterator objects does not throw.
987  *
988  * Since 1.0.1
989  */
990 inline bool operator!=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
991  return !(iter1.base() == iter2.base());
992 }
993 
994 /**
995  * Less-than in reverse-iteration order: iter1 is less than iter2
996  * when iter1.base() lies after iter2.base() in the string, so an
997  * iterator at std::string::rbegin() is less than one at
998  * std::string::rend(). It will not throw provided that copying and
999  * comparing std::string::const_iterator objects does not throw, as
1000  * it will not in any sane implementation.
1001  *
1002  * Since 1.0.1
1003  */
1004 inline bool operator<(const ReverseIterator& iter1, const ReverseIterator& iter2) {
1005  return iter2.base() < iter1.base();
1006 }
1007 
1008 /**
1009  * Less-than-or-equal in reverse-iteration order: iter1 is not
1010  * greater than iter2 when iter1.base() is at or after iter2.base()
1011  * in the string, so an iterator at std::string::rbegin() is less
1012  * than one at std::string::rend(). It will not throw provided that
1013  * copying and comparing std::string::const_iterator objects does
1014  * not throw, as it will not in any sane implementation.
1015  *
1016  * Since 1.0.1
1017  */
1018 inline bool operator<=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
1019  return iter2.base() <= iter1.base();
1020 }
1021 
1022 /**
1023  * Greater-than in reverse-iteration order: iter1 is greater than
1024  * iter2 when iter1.base() lies before iter2.base() in the string,
1025  * so an iterator at std::string::rend() is greater than one at
1026  * std::string::rbegin(). It will not throw provided that copying
1027  * and comparing std::string::const_iterator objects does not throw,
1028  * as it will not in any sane implementation.
1029  *
1030  * Since 1.0.1
1031  */
1032 inline bool operator>(const ReverseIterator& iter1, const ReverseIterator& iter2) {
1033  return iter2.base() > iter1.base();
1034 }
1035 
1036 /**
1037  * Greater-than-or-equal in reverse-iteration order: iter1 is not
1038  * less than iter2 when iter1.base() is at or before iter2.base() in
1039  * the string, so an iterator at std::string::rend() is greater than
1040  * one at std::string::rbegin(). It will not throw provided that
1041  * copying and comparing std::string::const_iterator objects does
1042  * not throw, as it will not in any sane implementation.
1043  *
1044  * Since 1.0.1
1045  */
1046 inline bool operator>=(const ReverseIterator& iter1, const ReverseIterator& iter2) {
1047  return iter2.base() >= iter1.base();
1048 }
1049 
1050 /*** Iterator class methods which require ReverseIterator as a complete type ***/
1051 
1053  pos = iter.base(); // adopt the reverse iterator's physical position
1054  return *this; // allow chained assignment
1055 }
1056 
1057 inline Iterator::Iterator(const ReverseIterator& iter): pos(iter.base()) {} // starts at the reverse iterator's physical position (base()); defined here because ReverseIterator must be a complete type
1058 
1059 } // namespace Utf8
1060 
1061 } // namespace Cgu
1062 
1063 #endif
Cgu::Utf8::ReverseIterator::reference
gunichar reference
Definition: convert.h:715
Cgu::Utf8::operator>=
bool operator>=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:658
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const std::string::reverse_iterator &iter)
Definition: convert.h:890
Cgu::Utf8::wide_to_utf8
std::string wide_to_utf8(const std::wstring &input)
Cgu::Utf8::ReverseIterator::base
std::string::const_iterator base() const
Definition: convert.h:856
Cgu
Definition: application.h:45
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const Iterator &iter)
Definition: convert.h:831
Cgu::Utf8::Iterator::reference
gunichar reference
Definition: convert.h:367
Cgu::Utf8::ConversionError::ConversionError
ConversionError(const ConversionError &e)
Definition: convert.h:98
Cgu::Utf8::ConversionError::~ConversionError
~ConversionError()
Definition: convert.h:103
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const std::string::reverse_iterator &iter)
Definition: convert.h:804
Cgu::Utf8::ConversionError::ConversionError
ConversionError(const char *msg)
Definition: convert.h:99
Cgu::Utf8::Iterator::Iterator
Iterator(const Iterator &iter)
Definition: convert.h:539
Cgu::Utf8::Iterator::Iterator
Iterator()
Definition: convert.h:559
Cgu::Utf8::ReverseIterator::difference_type
std::string::difference_type difference_type
Definition: convert.h:717
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const std::string::const_reverse_iterator &iter)
Definition: convert.h:873
Cgu::Utf8::validate
bool validate(const std::string &text)
Definition: convert.h:278
Cgu::Utf8::ReverseIterator
A class which will iterate in reverse through a std::string object by reference to unicode characters...
Definition: convert.h:712
Cgu::Utf8::locale_from_utf8
std::string locale_from_utf8(const std::string &input)
Cgu::Utf8::locale_to_utf8
std::string locale_to_utf8(const std::string &input)
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const std::string::const_reverse_iterator &iter)
Definition: convert.h:790
Cgu::Utf8::ReverseIterator::operator--
ReverseIterator & operator--()
Definition: convert.h:949
Cgu::Utf8::operator==
bool operator==(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:603
Cgu::Utf8::ConversionError::operator=
ConversionError & operator=(const ConversionError &e)
Definition: convert.h:93
Cgu::Utf8::Iterator::pointer
void pointer
Definition: convert.h:368
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const std::string::const_iterator &iter)
Definition: convert.h:432
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const ReverseIterator &iter)
Definition: convert.h:901
Cgu::Utf8::ConversionError
Definition: convert.h:88
Cgu::Utf8::operator<
bool operator<(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:625
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const Iterator &iter)
Definition: convert.h:458
Cgu::Utf8::Iterator::difference_type
std::string::difference_type difference_type
Definition: convert.h:369
Cgu::Utf8::wide_from_utf8
std::wstring wide_from_utf8(const std::string &input)
Cgu::Utf8::Iterator::Iterator
Iterator(const std::string::const_iterator &iter)
Definition: convert.h:511
Cgu::Utf8::Iterator::operator*
Iterator::value_type operator*() const
Definition: convert.h:482
Cgu::Utf8::ReverseIterator::operator=
ReverseIterator & operator=(const ReverseIterator &iter)
Definition: convert.h:816
Cgu::Utf8::uniwide_to_utf8
std::string uniwide_to_utf8(const std::wstring &input)
Cgu::Utf8::ReverseIterator::operator*
ReverseIterator::value_type operator*() const
Definition: convert.h:965
CGU_GLIB_MEMORY_SLICES_FUNCS
#define CGU_GLIB_MEMORY_SLICES_FUNCS
Definition: cgu_config.h:84
Cgu::Utf8::ReverseIterator::iterator_category
std::bidirectional_iterator_tag iterator_category
Definition: convert.h:718
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator()
Definition: convert.h:922
Cgu::Utf8::ReverseIterator::value_type
gunichar value_type
Definition: convert.h:714
Cgu::Utf8::Iterator::operator++
Iterator & operator++()
Definition: convert.h:566
Cgu::Utf8::ReverseIterator::ReverseIterator
ReverseIterator(const Iterator &iter)
Definition: convert.h:915
Cgu::Utf8::operator>
bool operator>(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:647
Cgu::Utf8::Iterator::base
std::string::const_iterator base() const
Definition: convert.h:494
Cgu::Utf8::filename_to_utf8
std::string filename_to_utf8(const std::string &input)
Cgu::Utf8::operator<=
bool operator<=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:636
Cgu::Utf8::Iterator::value_type
gunichar value_type
Definition: convert.h:366
Cgu::Utf8::ConversionError::ConversionError
ConversionError(GError *error)
Definition: convert.h:101
Cgu::Utf8::filename_from_utf8
std::string filename_from_utf8(const std::string &input)
Cgu::Utf8::operator!=
bool operator!=(const Iterator &iter1, const Iterator &iter2)
Definition: convert.h:614
Cgu::Utf8::uniwide_from_utf8
std::wstring uniwide_from_utf8(const std::string &input)
Cgu::Utf8::Iterator
A class which will iterate through a std::string object by reference to unicode characters rather tha...
Definition: convert.h:364
Cgu::Utf8::ConversionError::what
virtual const char * what() const
Definition: convert.h:91
Cgu::Utf8::Iterator::iterator_category
std::bidirectional_iterator_tag iterator_category
Definition: convert.h:370
Cgu::Utf8::Iterator::operator=
Iterator & operator=(const std::string::iterator &iter)
Definition: convert.h:446
Cgu::Utf8::ReverseIterator::operator++
ReverseIterator & operator++()
Definition: convert.h:929
Cgu::Utf8::Iterator::Iterator
Iterator(const std::string::iterator &iter)
Definition: convert.h:528
Cgu::Utf8::ReverseIterator::pointer
void pointer
Definition: convert.h:716
cgu_config.h
Cgu::Utf8::Iterator::operator--
Iterator & operator--()
Definition: convert.h:581