Fast IO

struct Reader {
    begin: *const u8,
    cur: *const u8,
    end: *const u8,
    goff: usize,
    cap: usize,
}
impl Reader {
    fn new(capacity: usize) -> Self {
        let begin = unsafe { mmap(0, capacity, 3, 2, 0, 0) };
        Self {
            begin,
            end: unsafe { begin.add(capacity) },
            cur: begin,
            goff: 0,
            cap: capacity,
        }
    }
    fn try_refill(&mut self, readahead: usize) {
        if unsafe { self.cur.add(readahead) } < self.end {
            return;
        }
        self.goff += unsafe { self.cur.offset_from(self.begin) } as usize;
        let add = self.goff & 4095;
        self.goff &= !4095;
        self.begin = unsafe { mmap(self.begin as usize, self.cap, 3, 18, 0, self.goff as isize) };
        self.cur = unsafe { self.begin.add(add) };
        self.end = unsafe { self.begin.add(self.cap) };
    }
    fn u32(&mut self) -> u32 {
        let mut c = unsafe { self.cur.cast::<u64>().read_unaligned() };
        let m = !c & 0x1010101010101010;
        let len = m.trailing_zeros() >> 3;
        c <<= (8 - len) << 3;
        c = (c & 0x0F0F0F0F0F0F0F0F).wrapping_mul(2561) >> 8;
        c = (c & 0x00FF00FF00FF00FF).wrapping_mul(6553601) >> 16;
        c = (c & 0x0000FFFF0000FFFF).wrapping_mul(42949672960001) >> 32;
        self.cur = unsafe { self.cur.add(len as usize) };
        if len == 8 {
            if unsafe { self.cur.read() } & 0x10 != 0 {
                c *= 10;
                c += (unsafe { self.cur.read() } - b'0') as u64;
                self.cur = unsafe { self.cur.add(1) };
            }
            if unsafe { self.cur.read() } & 0x10 != 0 {
                c *= 10;
                c += (unsafe { self.cur.read() } - b'0') as u64;
                self.cur = unsafe { self.cur.add(1) };
            }
        }
        self.cur = unsafe { self.cur.add(1) };
        c as u32
    }
    fn i32(&mut self) -> i32 {
        let sign = unsafe { self.cur.read() } == b'-';
        (if sign {
            self.cur = unsafe { self.cur.add(1) };
            self.u32().wrapping_neg()
        } else {
            self.u32()
        }) as i32
    }
    fn u64(&mut self) -> u64 {
        let mut c = unsafe { self.cur.cast::<u64>().read_unaligned() };
        let m = !c & 0x1010101010101010;
        let len = m.trailing_zeros() >> 3;
        c <<= (8 - len) << 3;
        c = (c & 0x0F0F0F0F0F0F0F0F).wrapping_mul(2561) >> 8;
        c = (c & 0x00FF00FF00FF00FF).wrapping_mul(6553601) >> 16;
        c = (c & 0x0000FFFF0000FFFF).wrapping_mul(42949672960001) >> 32;
        self.cur = unsafe { self.cur.add(len as usize) };
        if len == 8 && unsafe { self.cur.read() } & 16 != 0 {
            let mut d = unsafe { self.cur.cast::<u64>().read_unaligned() };
            let m = !d & 0x1010101010101010;
            let len = m.trailing_zeros() >> 3;
            for _ in 0..len {
                c *= 10;
            }
            d <<= (8 - len) << 3;
            d = (d & 0x0F0F0F0F0F0F0F0F).wrapping_mul(2561) >> 8;
            d = (d & 0x00FF00FF00FF00FF).wrapping_mul(6553601) >> 16;
            d = (d & 0x0000FFFF0000FFFF).wrapping_mul(42949672960001) >> 32;
            c += d;
            self.cur = unsafe { self.cur.add(len as usize) };
            if len == 8 {
                if unsafe { self.cur.read() } & 0x10 != 0 {
                    c *= 10;
                    c += (unsafe { self.cur.read() } - b'0') as u64;
                    self.cur = unsafe { self.cur.add(1) };
                }
                if unsafe { self.cur.read() } & 0x10 != 0 {
                    c *= 10;
                    c += (unsafe { self.cur.read() } - b'0') as u64;
                    self.cur = unsafe { self.cur.add(1) };
                }
                if unsafe { self.cur.read() } & 0x10 != 0 {
                    c *= 10;
                    c += (unsafe { self.cur.read() } - b'0') as u64;
                    self.cur = unsafe { self.cur.add(1) };
                }
                if unsafe { self.cur.read() } & 0x10 != 0 {
                    c *= 10;
                    c += (unsafe { self.cur.read() } - b'0') as u64;
                    self.cur = unsafe { self.cur.add(1) };
                }
            }
        }
        self.cur = unsafe { self.cur.add(1) };
        c
    }
    fn i64(&mut self) -> i64 {
        let sign = unsafe { self.cur.read() } == b'-';
        (if sign {
            self.cur = unsafe { self.cur.add(1) };
            self.u64().wrapping_neg()
        } else {
            self.u64()
        }) as i64
    }
    fn consume(&mut self, add: usize) {
        self.cur = unsafe { self.cur.add(add) };
    }
    fn until(&mut self, delim: u8, buf: &mut String) -> usize {
        #[target_feature(enable = "avx2,sse4.2")]
        unsafe fn memchr(s: &[u8], delim: u8) -> Option<usize> {
            s.iter().position(|&b| b == delim)
        }
        let mut total = 0;
        loop {
            let len = unsafe { self.end.offset_from(self.cur) } as usize;
            let range = unsafe { std::slice::from_raw_parts(self.cur, len) };
            if let Some(i) = unsafe { memchr(range, delim) } {
                unsafe { buf.as_mut_vec() }.extend_from_slice(&range[..i]);
                self.cur = unsafe { self.cur.add(i + 1) };
                break total + i;
            } else {
                unsafe { buf.as_mut_vec() }.extend_from_slice(&range);
                self.cur = self.end;
                self.try_refill(1);
                total += len;
            }
        }
    }
    fn remain(&self) -> &[u8] {
        let len = unsafe { self.end.offset_from(self.cur) } as usize;
        unsafe { std::slice::from_raw_parts(self.cur, len) }
    }
    fn discard(&mut self, until: u8) -> usize {
        let mut len = 0;
        #[target_feature(enable = "avx2")]
        unsafe fn index(s: &[u8], b: u8) -> Option<usize> {
            s.iter().position(|&c| c == b)
        }
        loop {
            let pos = unsafe { index(self.remain(), until) };
            if let Some(pos) = pos {
                len += pos;
                self.cur = unsafe { self.cur.add(pos + 1) };
                break len;
            }
            len += unsafe { self.end.offset_from(self.cur) } as usize;
            self.cur = self.end;
            self.try_refill(1);
        }
    }
}
struct Writer {
    buf: Vec<u8>,
    off: usize,
}
impl Drop for Writer {
    fn drop(&mut self) {
        self.flush();
    }
}
#[repr(align(16))]
struct B128([u8; 16]);
#[target_feature(enable = "avx2")]
unsafe fn cvt8(out: &mut B128, n: u32) -> usize {
    use std::arch::x86_64::*;
    let x = _mm_cvtsi32_si128(n as i32);
    let div_10000 = _mm_set1_epi32(0xd1b71759u32 as i32);
    let mul_10000_merge = _mm_set1_epi32(55536);
    let div_var = _mm_setr_epi16(
        8389,
        5243,
        13108,
        0x8000u16 as i16,
        8389,
        5243,
        13108,
        0x8000u16 as i16,
    );
    let shift_var = _mm_setr_epi16(
        1 << 7,
        1 << 11,
        1 << 13,
        (1 << 15) as i16,
        1 << 7,
        1 << 11,
        1 << 13,
        (1 << 15) as i16,
    );
    let mul_10 = _mm_set1_epi16(10);
    let ascii0 = _mm_set1_epi8(48);
    let x_div_10000 = _mm_srli_epi64::<45>(_mm_mul_epu32(x, div_10000));
    let y = _mm_add_epi32(x, _mm_mul_epu32(x_div_10000, mul_10000_merge));
    let t0 = _mm_slli_epi16::<2>(_mm_shuffle_epi32::<5>(_mm_unpacklo_epi16(y, y)));
    let t1 = _mm_mulhi_epu16(t0, div_var);
    let t2 = _mm_mulhi_epu16(t1, shift_var);
    let t3 = _mm_slli_epi64::<16>(t2);
    let t4 = _mm_mullo_epi16(t3, mul_10);
    let t5 = _mm_sub_epi16(t2, t4);
    let t6 = _mm_packus_epi16(_mm_setzero_si128(), t5);
    let mask = _mm_movemask_epi8(_mm_cmpeq_epi8(t6, _mm_setzero_si128()));
    let offset = (mask & !0x8000).trailing_ones() as usize;
    let ascii = _mm_add_epi8(t6, ascii0);
    _mm_store_si128(out.0.as_mut_ptr().cast(), ascii);
    offset
}
impl Writer {
    fn new(capacity: usize) -> Self {
        let mut buf = vec![];
        buf.reserve_exact(capacity);
        unsafe { buf.set_len(capacity) };
        Self { buf, off: 0 }
    }
    fn flush(&mut self) {
        unsafe { write(1, self.buf.as_ptr(), self.off) };
        self.off = 0;
    }
    fn try_flush(&mut self, readahead: usize) {
        if self.off + readahead > self.buf.len() {
            self.flush();
        }
    }
    fn byte(&mut self, b: u8) {
        self.try_flush(1);
        self.buf[self.off] = b;
        self.off += 1;
    }
    fn bytes(&mut self, s: &[u8]) {
        let mut i = 0;
        while i < s.len() {
            let rem = s[i..].len().min(self.buf[self.off..].len());
            self.buf[self.off..self.off + rem].copy_from_slice(&s[i..i + rem]);
            self.off += rem;
            i += rem;
            if self.off == self.buf.len() {
                self.flush();
            }
        }
    }
    fn i32(&mut self, n: i32) {
        if n < 0 {
            self.byte(b'-');
            self.u32((n as u32).wrapping_neg());
        } else {
            self.u32(n as u32);
        }
    }
    fn u32(&mut self, n: u32) {
        let mut b128 = B128([0u8; 16]);
        let mut off;
        if n >= 100_000_000 {
            self.try_flush(10);
            let mut hi = n / 100_000_000;
            let lo = n % 100_000_000;
            unsafe { cvt8(&mut b128, lo) };
            off = 8;
            off -= 1;
            b128.0[off] = (hi % 10) as u8 + b'0';
            if hi >= 10 {
                off -= 1;
                hi /= 10;
                b128.0[off] = hi as u8 + b'0';
            }
        } else {
            self.try_flush(8);
            off = unsafe { cvt8(&mut b128, n) };
        }
        let len = 16 - off;
        self.buf[self.off..self.off + len].copy_from_slice(&b128.0[off..]);
        self.off += len;
    }
    fn i64(&mut self, n: i64) {
        if n < 0 {
            self.byte(b'-');
            self.u64((n as u64).wrapping_neg());
        } else {
            self.u64(n as u64);
        }
    }
    fn u64(&mut self, n: u64) {
        let mut hi128 = B128([0u8; 16]);
        let mut lo128 = B128([0u8; 16]);
        let mut hioff;
        let looff;
        if n >= 10_000_000_000_000_000 {
            self.try_flush(20);
            let mut hi = (n / 10_000_000_000_000_000) as u32;
            let lo = n % 10_000_000_000_000_000;
            let lohi = (lo / 100_000_000) as u32;
            let lolo = (lo % 100_000_000) as u32;
            unsafe { cvt8(&mut hi128, lohi) };
            unsafe { cvt8(&mut lo128, lolo) };
            hioff = 8;
            looff = 8;
            hioff -= 1;
            hi128.0[hioff] = (hi % 10) as u8 + b'0';
            if hi >= 10 {
                hioff -= 1;
                hi /= 10;
                hi128.0[hioff] = (hi % 10) as u8 + b'0';
            }
            if hi >= 10 {
                hioff -= 1;
                hi /= 10;
                hi128.0[hioff] = (hi % 10) as u8 + b'0';
            }
            if hi >= 10 {
                hioff -= 1;
                hi /= 10;
                hi128.0[hioff] = hi as u8 + b'0';
            }
        } else if n >= 100_000_000 {
            self.try_flush(16);
            let hi = (n / 100_000_000) as u32;
            let lo = (n % 100_000_000) as u32;
            hioff = unsafe { cvt8(&mut hi128, hi) };
            unsafe { cvt8(&mut lo128, lo) };
            looff = 8;
        } else {
            self.try_flush(8);
            hioff = 16;
            looff = unsafe { cvt8(&mut lo128, n as u32) };
        }
        let len = 16 - hioff;
        self.buf[self.off..self.off + len].copy_from_slice(&hi128.0[hioff..]);
        self.off += len;
        let len = 16 - looff;
        self.buf[self.off..self.off + len].copy_from_slice(&lo128.0[looff..]);
        self.off += len;
    }
}