pyo3/types/
string.rs

1#[cfg(not(Py_LIMITED_API))]
2use crate::exceptions::PyUnicodeDecodeError;
3use crate::ffi_ptr_ext::FfiPtrExt;
4use crate::instance::Borrowed;
5use crate::py_result_ext::PyResultExt;
6use crate::types::bytes::PyBytesMethods;
7use crate::types::PyBytes;
8use crate::{ffi, Bound, Py, PyAny, PyResult, Python};
9use std::borrow::Cow;
10use std::ffi::{CStr, CString};
11use std::str;
12
/// Borrowed view of the raw code units stored inside a Python `str`.
///
/// A Python string keeps its contents in one of several fixed-width
/// representations; each variant of this enumeration corresponds to one of them
/// and borrows the matching slice of code units.
#[cfg(not(Py_LIMITED_API))]
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum PyStringData<'a> {
    /// UCS1 storage: one byte per code unit.
    Ucs1(&'a [u8]),

    /// UCS2 storage: two bytes per code unit.
    Ucs2(&'a [u16]),

    /// UCS4 storage: four bytes per code unit.
    Ucs4(&'a [u32]),
}
29
30#[cfg(not(Py_LIMITED_API))]
31impl<'a> PyStringData<'a> {
32    /// Obtain the raw bytes backing this instance as a [u8] slice.
33    pub fn as_bytes(&self) -> &[u8] {
34        match self {
35            Self::Ucs1(s) => s,
36            Self::Ucs2(s) => unsafe {
37                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
38            },
39            Self::Ucs4(s) => unsafe {
40                std::slice::from_raw_parts(s.as_ptr().cast(), s.len() * self.value_width_bytes())
41            },
42        }
43    }
44
45    /// Size in bytes of each value/item in the underlying slice.
46    #[inline]
47    pub fn value_width_bytes(&self) -> usize {
48        match self {
49            Self::Ucs1(_) => 1,
50            Self::Ucs2(_) => 2,
51            Self::Ucs4(_) => 4,
52        }
53    }
54
55    /// Convert the raw data to a Rust string.
56    ///
57    /// For UCS-1 / UTF-8, returns a borrow into the original slice. For UCS-2 and UCS-4,
58    /// returns an owned string.
59    ///
60    /// Returns [PyUnicodeDecodeError] if the string data isn't valid in its purported
61    /// storage format. This should only occur for strings that were created via Python
62    /// C APIs that skip input validation (like `PyUnicode_FromKindAndData`) and should
63    /// never occur for strings that were created from Python code.
64    pub fn to_string(self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
65        match self {
66            Self::Ucs1(data) => match str::from_utf8(data) {
67                Ok(s) => Ok(Cow::Borrowed(s)),
68                Err(e) => Err(PyUnicodeDecodeError::new_utf8(py, data, e)?.into()),
69            },
70            Self::Ucs2(data) => match String::from_utf16(data) {
71                Ok(s) => Ok(Cow::Owned(s)),
72                Err(e) => {
73                    let mut message = e.to_string().as_bytes().to_vec();
74                    message.push(0);
75
76                    Err(PyUnicodeDecodeError::new(
77                        py,
78                        ffi::c_str!("utf-16"),
79                        self.as_bytes(),
80                        0..self.as_bytes().len(),
81                        CStr::from_bytes_with_nul(&message).unwrap(),
82                    )?
83                    .into())
84                }
85            },
86            Self::Ucs4(data) => match data.iter().map(|&c| std::char::from_u32(c)).collect() {
87                Some(s) => Ok(Cow::Owned(s)),
88                None => Err(PyUnicodeDecodeError::new(
89                    py,
90                    ffi::c_str!("utf-32"),
91                    self.as_bytes(),
92                    0..self.as_bytes().len(),
93                    ffi::c_str!("error converting utf-32"),
94                )?
95                .into()),
96            },
97        }
98    }
99
100    /// Convert the raw data to a Rust string, possibly with data loss.
101    ///
102    /// Invalid code points will be replaced with `U+FFFD REPLACEMENT CHARACTER`.
103    ///
104    /// Returns a borrow into original data, when possible, or owned data otherwise.
105    ///
106    /// The return value of this function should only disagree with [Self::to_string]
107    /// when that method would error.
108    pub fn to_string_lossy(self) -> Cow<'a, str> {
109        match self {
110            Self::Ucs1(data) => String::from_utf8_lossy(data),
111            Self::Ucs2(data) => Cow::Owned(String::from_utf16_lossy(data)),
112            Self::Ucs4(data) => Cow::Owned(
113                data.iter()
114                    .map(|&c| std::char::from_u32(c).unwrap_or('\u{FFFD}'))
115                    .collect(),
116            ),
117        }
118    }
119}
120
121/// Represents a Python `string` (a Unicode string object).
122///
123/// Values of this type are accessed via PyO3's smart pointers, e.g. as
124/// [`Py<PyString>`][crate::Py] or [`Bound<'py, PyString>`][Bound].
125///
126/// For APIs available on `str` objects, see the [`PyStringMethods`] trait which is implemented for
127/// [`Bound<'py, PyString>`][Bound].
128///
129/// # Equality
130///
131/// For convenience, [`Bound<'py, PyString>`] implements [`PartialEq<str>`] to allow comparing the
132/// data in the Python string to a Rust UTF-8 string slice.
133///
134/// This is not always the most appropriate way to compare Python strings, as Python string
135/// subclasses may have different equality semantics. In situations where subclasses overriding
136/// equality might be relevant, use [`PyAnyMethods::eq`](crate::types::any::PyAnyMethods::eq), at
137/// cost of the additional overhead of a Python method call.
138///
139/// ```rust
140/// # use pyo3::prelude::*;
141/// use pyo3::types::PyString;
142///
143/// # Python::attach(|py| {
144/// let py_string = PyString::new(py, "foo");
145/// // via PartialEq<str>
146/// assert_eq!(py_string, "foo");
147///
148/// // via Python equality
149/// assert!(py_string.as_any().eq("foo").unwrap());
150/// # });
151/// ```
152#[repr(transparent)]
153pub struct PyString(PyAny);
154
155pyobject_native_type_core!(PyString, pyobject_native_static_type_object!(ffi::PyUnicode_Type), #checkfunction=ffi::PyUnicode_Check);
156
157impl PyString {
158    /// Creates a new Python string object.
159    ///
160    /// Panics if out of memory.
161    pub fn new<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
162        let ptr = s.as_ptr().cast();
163        let len = s.len() as ffi::Py_ssize_t;
164        unsafe {
165            ffi::PyUnicode_FromStringAndSize(ptr, len)
166                .assume_owned(py)
167                .cast_into_unchecked()
168        }
169    }
170
171    /// Intern the given string
172    ///
173    /// This will return a reference to the same Python string object if called repeatedly with the same string.
174    ///
175    /// Note that while this is more memory efficient than [`PyString::new`], it unconditionally allocates a
176    /// temporary Python string object and is thereby slower than [`PyString::new`].
177    ///
178    /// Panics if out of memory.
179    pub fn intern<'py>(py: Python<'py>, s: &str) -> Bound<'py, PyString> {
180        let ptr = s.as_ptr().cast();
181        let len = s.len() as ffi::Py_ssize_t;
182        unsafe {
183            let mut ob = ffi::PyUnicode_FromStringAndSize(ptr, len);
184            if !ob.is_null() {
185                ffi::PyUnicode_InternInPlace(&mut ob);
186            }
187            ob.assume_owned(py).cast_into_unchecked()
188        }
189    }
190
191    /// Attempts to create a Python string from a Python [bytes-like object].
192    ///
193    /// The `encoding` and `errors` parameters are optional:
194    /// - If `encoding` is `None`, the default encoding is used (UTF-8).
195    /// - If `errors` is `None`, the default error handling is used ("strict").
196    ///
197    /// See the [Python documentation on codecs] for more information.
198    ///
199    /// [bytes-like object]: (https://docs.python.org/3/glossary.html#term-bytes-like-object).
200    /// [Python documentation on codecs]: https://docs.python.org/3/library/codecs.html#standard-encodings
201    pub fn from_encoded_object<'py>(
202        src: &Bound<'py, PyAny>,
203        encoding: Option<&CStr>,
204        errors: Option<&CStr>,
205    ) -> PyResult<Bound<'py, PyString>> {
206        let encoding = encoding.map_or(std::ptr::null(), CStr::as_ptr);
207        let errors = errors.map_or(std::ptr::null(), CStr::as_ptr);
208        // Safety:
209        // - `src` is a valid Python object
210        // - `encoding` and `errors` are either null or valid C strings. `encoding` and `errors` are
211        //   documented as allowing null.
212        // - `ffi::PyUnicode_FromEncodedObject` returns a new `str` object, or sets an error.
213        unsafe {
214            ffi::PyUnicode_FromEncodedObject(src.as_ptr(), encoding, errors)
215                .assume_owned_or_err(src.py())
216                .cast_into_unchecked()
217        }
218    }
219
220    /// Deprecated form of `PyString::from_encoded_object`.
221    ///
222    /// This version took `&str` arguments for `encoding` and `errors`, which required a runtime
223    /// conversion to `CString` internally.
224    #[deprecated(
225        since = "0.25.0",
226        note = "replaced with to `PyString::from_encoded_object`"
227    )]
228    pub fn from_object<'py>(
229        src: &Bound<'py, PyAny>,
230        encoding: &str,
231        errors: &str,
232    ) -> PyResult<Bound<'py, PyString>> {
233        let encoding = CString::new(encoding)?;
234        let errors = CString::new(errors)?;
235        PyString::from_encoded_object(src, Some(&encoding), Some(&errors))
236    }
237}
238
239/// Implementation of functionality for [`PyString`].
240///
241/// These methods are defined for the `Bound<'py, PyString>` smart pointer, so to use method call
242/// syntax these methods are separated into a trait, because stable Rust does not yet support
243/// `arbitrary_self_types`.
244#[doc(alias = "PyString")]
245pub trait PyStringMethods<'py>: crate::sealed::Sealed {
246    /// Gets the Python string as a Rust UTF-8 string slice.
247    ///
248    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
249    /// (containing unpaired surrogates).
250    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
251    fn to_str(&self) -> PyResult<&str>;
252
253    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
254    ///
255    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
256    /// (containing unpaired surrogates).
257    fn to_cow(&self) -> PyResult<Cow<'_, str>>;
258
259    /// Converts the `PyString` into a Rust string.
260    ///
261    /// Unpaired surrogates invalid UTF-8 sequences are
262    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
263    fn to_string_lossy(&self) -> Cow<'_, str>;
264
265    /// Encodes this string as a Python `bytes` object, using UTF-8 encoding.
266    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>>;
267
268    /// Obtains the raw data backing the Python string.
269    ///
270    /// If the Python string object was created through legacy APIs, its internal storage format
271    /// will be canonicalized before data is returned.
272    ///
273    /// # Safety
274    ///
275    /// This function implementation relies on manually decoding a C bitfield. In practice, this
276    /// works well on common little-endian architectures such as x86_64, where the bitfield has a
277    /// common representation (even if it is not part of the C spec). The PyO3 CI tests this API on
278    /// x86_64 platforms.
279    ///
280    /// By using this API, you accept responsibility for testing that PyStringData behaves as
281    /// expected on the targets where you plan to distribute your software.
282    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
283    unsafe fn data(&self) -> PyResult<PyStringData<'_>>;
284}
285
286impl<'py> PyStringMethods<'py> for Bound<'py, PyString> {
287    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
288    fn to_str(&self) -> PyResult<&str> {
289        self.as_borrowed().to_str()
290    }
291
292    fn to_cow(&self) -> PyResult<Cow<'_, str>> {
293        self.as_borrowed().to_cow()
294    }
295
296    fn to_string_lossy(&self) -> Cow<'_, str> {
297        self.as_borrowed().to_string_lossy()
298    }
299
300    fn encode_utf8(&self) -> PyResult<Bound<'py, PyBytes>> {
301        unsafe {
302            ffi::PyUnicode_AsUTF8String(self.as_ptr())
303                .assume_owned_or_err(self.py())
304                .cast_into_unchecked::<PyBytes>()
305        }
306    }
307
308    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
309    unsafe fn data(&self) -> PyResult<PyStringData<'_>> {
310        unsafe { self.as_borrowed().data() }
311    }
312}
313
314impl<'a> Borrowed<'a, '_, PyString> {
315    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
316    #[allow(clippy::wrong_self_convention)]
317    pub(crate) fn to_str(self) -> PyResult<&'a str> {
318        // PyUnicode_AsUTF8AndSize only available on limited API starting with 3.10.
319        let mut size: ffi::Py_ssize_t = 0;
320        let data: *const u8 =
321            unsafe { ffi::PyUnicode_AsUTF8AndSize(self.as_ptr(), &mut size).cast() };
322        if data.is_null() {
323            Err(crate::PyErr::fetch(self.py()))
324        } else {
325            Ok(unsafe {
326                std::str::from_utf8_unchecked(std::slice::from_raw_parts(data, size as usize))
327            })
328        }
329    }
330
331    #[allow(clippy::wrong_self_convention)]
332    pub(crate) fn to_cow(self) -> PyResult<Cow<'a, str>> {
333        // TODO: this method can probably be deprecated once Python 3.9 support is dropped,
334        // because all versions then support the more efficient `to_str`.
335        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
336        {
337            self.to_str().map(Cow::Borrowed)
338        }
339
340        #[cfg(not(any(Py_3_10, not(Py_LIMITED_API))))]
341        {
342            let bytes = self.encode_utf8()?;
343            Ok(Cow::Owned(
344                unsafe { str::from_utf8_unchecked(bytes.as_bytes()) }.to_owned(),
345            ))
346        }
347    }
348
349    #[allow(clippy::wrong_self_convention)]
350    fn to_string_lossy(self) -> Cow<'a, str> {
351        let ptr = self.as_ptr();
352        let py = self.py();
353
354        #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
355        if let Ok(s) = self.to_str() {
356            return Cow::Borrowed(s);
357        }
358
359        let bytes = unsafe {
360            ffi::PyUnicode_AsEncodedString(
361                ptr,
362                ffi::c_str!("utf-8").as_ptr(),
363                ffi::c_str!("surrogatepass").as_ptr(),
364            )
365            .assume_owned(py)
366            .cast_into_unchecked::<PyBytes>()
367        };
368        Cow::Owned(String::from_utf8_lossy(bytes.as_bytes()).into_owned())
369    }
370
371    #[cfg(not(any(Py_LIMITED_API, GraalPy, PyPy)))]
372    unsafe fn data(self) -> PyResult<PyStringData<'a>> {
373        unsafe {
374            let ptr = self.as_ptr();
375
376            #[cfg(not(Py_3_12))]
377            #[allow(deprecated)]
378            {
379                let ready = ffi::PyUnicode_READY(ptr);
380                if ready != 0 {
381                    // Exception was created on failure.
382                    return Err(crate::PyErr::fetch(self.py()));
383                }
384            }
385
386            // The string should be in its canonical form after calling `PyUnicode_READY()`.
387            // And non-canonical form not possible after Python 3.12. So it should be safe
388            // to call these APIs.
389            let length = ffi::PyUnicode_GET_LENGTH(ptr) as usize;
390            let raw_data = ffi::PyUnicode_DATA(ptr);
391            let kind = ffi::PyUnicode_KIND(ptr);
392
393            match kind {
394                ffi::PyUnicode_1BYTE_KIND => Ok(PyStringData::Ucs1(std::slice::from_raw_parts(
395                    raw_data as *const u8,
396                    length,
397                ))),
398                ffi::PyUnicode_2BYTE_KIND => Ok(PyStringData::Ucs2(std::slice::from_raw_parts(
399                    raw_data as *const u16,
400                    length,
401                ))),
402                ffi::PyUnicode_4BYTE_KIND => Ok(PyStringData::Ucs4(std::slice::from_raw_parts(
403                    raw_data as *const u32,
404                    length,
405                ))),
406                _ => unreachable!(),
407            }
408        }
409    }
410}
411
412impl Py<PyString> {
413    /// Gets the Python string as a Rust UTF-8 string slice.
414    ///
415    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
416    /// (containing unpaired surrogates).
417    ///
418    /// Because `str` objects are immutable, the returned slice is independent of
419    /// the GIL lifetime.
420    #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
421    pub fn to_str<'a>(&'a self, py: Python<'_>) -> PyResult<&'a str> {
422        self.bind_borrowed(py).to_str()
423    }
424
425    /// Converts the `PyString` into a Rust string, avoiding copying when possible.
426    ///
427    /// Returns a `UnicodeEncodeError` if the input is not valid unicode
428    /// (containing unpaired surrogates).
429    ///
430    /// Because `str` objects are immutable, the returned slice is independent of
431    /// the GIL lifetime.
432    pub fn to_cow<'a>(&'a self, py: Python<'_>) -> PyResult<Cow<'a, str>> {
433        self.bind_borrowed(py).to_cow()
434    }
435
436    /// Converts the `PyString` into a Rust string.
437    ///
438    /// Unpaired surrogates invalid UTF-8 sequences are
439    /// replaced with `U+FFFD REPLACEMENT CHARACTER`.
440    ///
441    /// Because `str` objects are immutable, the returned slice is independent of
442    /// the GIL lifetime.
443    pub fn to_string_lossy<'a>(&'a self, py: Python<'_>) -> Cow<'a, str> {
444        self.bind_borrowed(py).to_string_lossy()
445    }
446}
447
448/// Compares whether the data in the Python string is equal to the given UTF8.
449///
450/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
451impl PartialEq<str> for Bound<'_, PyString> {
452    #[inline]
453    fn eq(&self, other: &str) -> bool {
454        self.as_borrowed() == *other
455    }
456}
457
458/// Compares whether the data in the Python string is equal to the given UTF8.
459///
460/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
461impl PartialEq<&'_ str> for Bound<'_, PyString> {
462    #[inline]
463    fn eq(&self, other: &&str) -> bool {
464        self.as_borrowed() == **other
465    }
466}
467
468/// Compares whether the data in the Python string is equal to the given UTF8.
469///
470/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
471impl PartialEq<Bound<'_, PyString>> for str {
472    #[inline]
473    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
474        *self == other.as_borrowed()
475    }
476}
477
478/// Compares whether the data in the Python string is equal to the given UTF8.
479///
480/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
481impl PartialEq<&'_ Bound<'_, PyString>> for str {
482    #[inline]
483    fn eq(&self, other: &&Bound<'_, PyString>) -> bool {
484        *self == other.as_borrowed()
485    }
486}
487
488/// Compares whether the data in the Python string is equal to the given UTF8.
489///
490/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
491impl PartialEq<Bound<'_, PyString>> for &'_ str {
492    #[inline]
493    fn eq(&self, other: &Bound<'_, PyString>) -> bool {
494        **self == other.as_borrowed()
495    }
496}
497
498/// Compares whether the data in the Python string is equal to the given UTF8.
499///
500/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
501impl PartialEq<str> for &'_ Bound<'_, PyString> {
502    #[inline]
503    fn eq(&self, other: &str) -> bool {
504        self.as_borrowed() == other
505    }
506}
507
508/// Compares whether the data in the Python string is equal to the given UTF8.
509///
510/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
511impl PartialEq<str> for Borrowed<'_, '_, PyString> {
512    #[inline]
513    fn eq(&self, other: &str) -> bool {
514        #[cfg(not(Py_3_13))]
515        {
516            self.to_cow().is_ok_and(|s| s == other)
517        }
518
519        #[cfg(Py_3_13)]
520        unsafe {
521            ffi::PyUnicode_EqualToUTF8AndSize(
522                self.as_ptr(),
523                other.as_ptr().cast(),
524                other.len() as _,
525            ) == 1
526        }
527    }
528}
529
530/// Compares whether the data in the Python string is equal to the given UTF8.
531///
532/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
533impl PartialEq<&str> for Borrowed<'_, '_, PyString> {
534    #[inline]
535    fn eq(&self, other: &&str) -> bool {
536        *self == **other
537    }
538}
539
540/// Compares whether the data in the Python string is equal to the given UTF8.
541///
542/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
543impl PartialEq<Borrowed<'_, '_, PyString>> for str {
544    #[inline]
545    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
546        other == self
547    }
548}
549
550/// Compares whether the data in the Python string is equal to the given UTF8.
551///
552/// In some cases Python equality might be more appropriate; see the note on [`PyString`].
553impl PartialEq<Borrowed<'_, '_, PyString>> for &'_ str {
554    #[inline]
555    fn eq(&self, other: &Borrowed<'_, '_, PyString>) -> bool {
556        other == self
557    }
558}
559
#[cfg(test)]
mod tests {
    use pyo3_ffi::c_str;

    use super::*;
    use crate::{exceptions::PyLookupError, types::PyAnyMethods as _, IntoPyObject};

    /// `to_cow` round-trips text that mixes ASCII with a non-BMP character.
    #[test]
    fn test_to_cow_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    /// A lone surrogate cannot be converted, so `to_cow` reports an error.
    #[test]
    fn test_to_cow_surrogate() {
        Python::attach(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();
            assert!(py_string.to_cow().is_err());
        })
    }

    /// `to_cow` round-trips non-ASCII text.
    #[test]
    fn test_to_cow_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            assert_eq!(s, py_string.to_cow().unwrap());
        })
    }

    /// `encode_utf8` yields the same bytes as the Rust string's UTF-8 encoding.
    #[test]
    fn test_encode_utf8_unicode() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let obj = PyString::new(py, s);
            assert_eq!(s.as_bytes(), obj.encode_utf8().unwrap().as_bytes());
        })
    }

    /// A lone surrogate cannot be encoded as UTF-8.
    #[test]
    fn test_encode_utf8_surrogate() {
        Python::attach(|py| {
            let obj: Py<PyAny> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .into();
            assert!(obj
                .bind(py)
                .cast::<PyString>()
                .unwrap()
                .encode_utf8()
                .is_err());
        })
    }

    /// The lossy conversion substitutes replacement characters for a lone surrogate.
    #[test]
    fn test_to_string_lossy() {
        Python::attach(|py| {
            let py_string = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .cast_into::<PyString>()
                .unwrap();

            assert_eq!(py_string.to_string_lossy(), "🐈 Hello ���World");
        })
    }

    /// `Debug` formatting shows the Python `repr` of the string.
    #[test]
    fn test_debug_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s:?}"), "'Hello\\n'");
        })
    }

    /// `Display` formatting shows the string contents unescaped.
    #[test]
    fn test_display_string() {
        Python::attach(|py| {
            let s = "Hello\n".into_pyobject(py).unwrap();
            assert_eq!(format!("{s}"), "Hello\n");
        })
    }

    /// Decoding bytes honours the default and explicit error handlers, through both the
    /// current and the deprecated entry point.
    #[test]
    fn test_string_from_encoded_object() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"ab\xFFcd");

            // default encoding is utf-8, default error handler is strict
            let err = PyString::from_encoded_object(&py_bytes, None, None).unwrap_err();
            assert!(err
                .get_type(py)
                .is(py.get_type::<crate::exceptions::PyUnicodeDecodeError>()));

            // with `ignore` error handler, the invalid byte is dropped
            let py_string =
                PyString::from_encoded_object(&py_bytes, None, Some(c_str!("ignore"))).unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");

            #[allow(deprecated)]
            let py_string = PyString::from_object(&py_bytes, "utf-8", "ignore").unwrap();

            let result = py_string.to_cow().unwrap();
            assert_eq!(result, "abcd");
        });
    }

    /// Unknown encodings or error handlers, and names containing NUL bytes, are rejected.
    #[test]
    fn test_string_from_encoded_object_with_invalid_encoding_errors() {
        Python::attach(|py| {
            let py_bytes = PyBytes::new(py, b"abcd");

            // invalid encoding
            let err =
                PyString::from_encoded_object(&py_bytes, Some(c_str!("wat")), None).unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(err.to_string(), "LookupError: unknown encoding: wat");

            // invalid error handler
            let err = PyString::from_encoded_object(
                &PyBytes::new(py, b"ab\xFFcd"),
                None,
                Some(c_str!("wat")),
            )
            .unwrap_err();
            assert!(err.is_instance(py, &py.get_type::<PyLookupError>()));
            assert_eq!(
                err.to_string(),
                "LookupError: unknown error handler name 'wat'"
            );

            // interior NUL in the encoding name
            #[allow(deprecated)]
            let result = PyString::from_object(&py_bytes, "utf\0-8", "ignore");
            assert!(result.is_err());

            // interior NUL in the error handler name
            #[allow(deprecated)]
            let result = PyString::from_object(&py_bytes, "utf-8", "ign\0ore");
            assert!(result.is_err());
        });
    }

    /// An ASCII string is exposed as one-byte data and converts without allocating.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1() {
        Python::attach(|py| {
            let s = PyString::new(py, "hello, world");
            let data = unsafe { s.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs1(b"hello, world"));
            assert_eq!(data.to_string(py).unwrap(), Cow::Borrowed("hello, world"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("hello, world"));
        })
    }

    /// One-byte data that is not valid UTF-8 errors in `to_string` and is replaced in the
    /// lossy conversion.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs1_invalid() {
        Python::attach(|py| {
            // 0xfe is not allowed in UTF-8.
            let buffer = b"f\xfe\0";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_1BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs1(b"f\xfe"));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-8' codec can't decode byte 0xfe in position 1"));
            assert_eq!(data.to_string_lossy(), Cow::Borrowed("f�"));
        });
    }

    /// A string containing a lone surrogate is exposed as two-byte data.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs2() {
        Python::attach(|py| {
            let s = py.eval(ffi::c_str!("'foo\\ud800'"), None, None).unwrap();
            let py_string = s.cast::<PyString>().unwrap();
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs2(&[102, 111, 111, 0xd800]));
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("foo�".to_string())
            );
        })
    }

    /// Two-byte data with a lone surrogate errors in `to_string` and is replaced in the
    /// lossy conversion.
    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs2_invalid() {
        Python::attach(|py| {
            // U+FF22 (valid) & U+d800 (never valid)
            let buffer = b"\x22\xff\x00\xd8\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_2BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs2(&[0xff22, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-16' codec can't decode bytes in position 0-3"));
            // Written with escapes so the expectation is unambiguously U+FF22 followed by
            // U+FFFD, rather than a look-alike ASCII letter.
            assert_eq!(
                data.to_string_lossy(),
                Cow::Owned::<str>("\u{ff22}\u{fffd}".into())
            );
        });
    }

    /// A string with a non-BMP character is exposed as four-byte data.
    #[test]
    #[cfg(not(any(Py_LIMITED_API, PyPy, GraalPy)))]
    fn test_string_data_ucs4() {
        Python::attach(|py| {
            let s = "哈哈🐈";
            let py_string = PyString::new(py, s);
            let data = unsafe { py_string.data().unwrap() };

            assert_eq!(data, PyStringData::Ucs4(&[21704, 21704, 128008]));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>(s.to_string()));
        })
    }

    /// Four-byte data with a surrogate code point errors in `to_string` and is replaced in
    /// the lossy conversion.
    #[test]
    #[cfg(all(not(any(Py_LIMITED_API, PyPy, GraalPy)), target_endian = "little"))]
    fn test_string_data_ucs4_invalid() {
        Python::attach(|py| {
            // U+20000 (valid) & U+d800 (never valid)
            let buffer = b"\x00\x00\x02\x00\x00\xd8\x00\x00\x00\x00\x00\x00";
            let ptr = unsafe {
                crate::ffi::PyUnicode_FromKindAndData(
                    crate::ffi::PyUnicode_4BYTE_KIND as _,
                    buffer.as_ptr().cast(),
                    2,
                )
            };
            assert!(!ptr.is_null());
            let s = unsafe { ptr.assume_owned(py).cast_into_unchecked::<PyString>() };
            let data = unsafe { s.data().unwrap() };
            assert_eq!(data, PyStringData::Ucs4(&[0x20000, 0xd800]));
            let err = data.to_string(py).unwrap_err();
            assert!(err.get_type(py).is(py.get_type::<PyUnicodeDecodeError>()));
            assert!(err
                .to_string()
                .contains("'utf-32' codec can't decode bytes in position 0-7"));
            assert_eq!(data.to_string_lossy(), Cow::Owned::<str>("𠀀�".into()));
        });
    }

    /// Interning the same text twice yields the same object; different text yields another.
    #[test]
    fn test_intern_string() {
        Python::attach(|py| {
            let py_string1 = PyString::intern(py, "foo");
            assert_eq!(py_string1, "foo");

            let py_string2 = PyString::intern(py, "foo");
            assert_eq!(py_string2, "foo");

            assert_eq!(py_string1.as_ptr(), py_string2.as_ptr());

            let py_string3 = PyString::intern(py, "bar");
            assert_eq!(py_string3, "bar");

            assert_ne!(py_string1.as_ptr(), py_string3.as_ptr());
        });
    }

    /// The `Py<PyString>` conversions round-trip valid text.
    #[test]
    fn test_py_to_str_utf8() {
        Python::attach(|py| {
            let s = "ascii 🐈";
            let py_string = PyString::new(py, s).unbind();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert_eq!(s, py_string.to_str(py).unwrap());

            assert_eq!(s, py_string.to_cow(py).unwrap());
        })
    }

    /// The `Py<PyString>` conversions report an error for a lone surrogate.
    #[test]
    fn test_py_to_str_surrogate() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'\ud800'"), None, None)
                .unwrap()
                .extract()
                .unwrap();

            #[cfg(any(Py_3_10, not(Py_LIMITED_API)))]
            assert!(py_string.to_str(py).is_err());

            assert!(py_string.to_cow(py).is_err());
        })
    }

    /// The `Py<PyString>` lossy conversion substitutes replacement characters.
    #[test]
    fn test_py_to_string_lossy() {
        Python::attach(|py| {
            let py_string: Py<PyString> = py
                .eval(ffi::c_str!(r"'🐈 Hello \ud800World'"), None, None)
                .unwrap()
                .extract()
                .unwrap();
            assert_eq!(py_string.to_string_lossy(py), "🐈 Hello ���World");
        })
    }

    /// Exercises every `PartialEq` combination between Python strings and Rust text.
    #[test]
    fn test_comparisons() {
        Python::attach(|py| {
            let s = "hello, world";
            let py_string = PyString::new(py, s);

            assert_eq!(py_string, "hello, world");

            // `Bound` against `&str`, in both operand orders
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Bound` against `str`, in both operand orders
            assert_eq!(py_string, *s);
            assert_eq!(&py_string, *s);
            assert_eq!(*s, py_string);
            assert_eq!(*s, &py_string);

            let py_string = py_string.as_borrowed();

            // `Borrowed` against `&str`, in both operand orders
            assert_eq!(py_string, s);
            assert_eq!(&py_string, s);
            assert_eq!(s, py_string);
            assert_eq!(s, &py_string);

            // `Borrowed` against `str`, in both operand orders
            assert_eq!(py_string, *s);
            assert_eq!(*s, py_string);
        })
    }
}