Slicing mseed files larger than 2 GB

Hi, I am trying to slice an mseed file that is almost 4 GB, but I keep running into the error below.

import obspy
from obspy import UTCDateTime
import os

# Path to your MiniSEED file
file_path = "/home/TSIMP_Data_2022/D110_JAN_MAR.mseed"

# Base output directory
base_output_dir = "/home/TSIMP_Data_2022/"

# Read the MiniSEED file
try:
    mseed_file = obspy.read(file_path)
except Exception as e:
    print(f"Error reading MiniSEED file: {e}")
    raise

# Get the start and end times
start = min(tr.stats.starttime for tr in mseed_file)
end = max(tr.stats.endtime for tr in mseed_file)

# Create a function to format and write each daily segment
def write_daily_segment(start_time, end_time, traces, output_dir):
    try:
        # Slice and write each trace
        for tr in traces:
            trace_slice = tr.slice(starttime=start_time, endtime=end_time)
            
            # Construct filename
            network = trace_slice.stats.network
            station = trace_slice.stats.station
            location = trace_slice.stats.location or "--"
            channel = trace_slice.stats.channel
            year = start_time.year
            julian_day = start_time.julday
            filename = f"{network}.{station}.{location}.{channel}.{year}.{julian_day:03d}.mseed"
            output_path = os.path.join(output_dir, filename)
            
            # Ensure output directory exists
            os.makedirs(output_dir, exist_ok=True)
            
            # Write the slice to a file
            trace_slice.write(output_path, format='MSEED')
            print(f"Saved: {output_path}")
    except Exception as e:
        print(f"Error writing daily segment: {e}")

# Initialize the start of the first day
doy = start.julday
year = start.year
t1 = UTCDateTime(f'{year:04d}-{doy:03d}T00:00:00')

while t1 < end:
    # Calculate the end of the current day
    t2 = UTCDateTime(f'{year:04d}-{doy:03d}T23:59:59')

    # Ensure t2 does not exceed the actual end time
    if t2 > end:
        t2 = end

    # Define output directory
    day_output_dir = os.path.join(base_output_dir, f"{year}_{doy:03d}")
    
    # Write the daily segment
    write_daily_segment(t1, t2, mseed_file, day_output_dir)

    # Move to the next day
    t1 = UTCDateTime(f'{year:04d}-{doy:03d}T00:00:00') + 86400  # 24 hours later
    
    # Increment day and handle year transition (365 or 366 days depending on leap year)
    doy += 1
    days_in_year = 366 if ((year % 4 == 0 and year % 100 != 0) or (year % 400 == 0)) else 365
    if doy > days_in_year:
        doy = 1
        year += 1

---------------------------------------------------------------------------
ObsPyMSEEDFilesizeTooLargeError           Traceback (most recent call last)
Cell In[2], line 13
     11 # Read the MiniSEED file
     12 try:
---> 13     mseed_file = obspy.read(file_path)
     14 except Exception as e:
     15     print(f"Error reading MiniSEED file: {e}")

File ~/.conda/envs/wmsan/lib/python3.12/site-packages/decorator.py:232, in decorate.<locals>.fun(*args, **kw)
    230 if not kwsyntax:
    231     args, kw = fix(args, kw, sig)
--> 232 return caller(func, *(extras + args), **kw)

File ~/.conda/envs/wmsan/lib/python3.12/site-packages/obspy/core/util/decorator.py:297, in map_example_filename.<locals>._map_example_filename(func, *args, **kwargs)
    295                 except IOError:
    296                     pass
--> 297 return func(*args, **kwargs)

File ~/.conda/envs/wmsan/lib/python3.12/site-packages/obspy/core/stream.py:208, in read(pathname_or_url, format, headonly, starttime, endtime, nearest_sample, dtype, apply_calib, check_compression, **kwargs)
    206     st = _create_example_stream(headonly=headonly)
    207 else:
--> 208     st = _generic_reader(pathname_or_url, _read, **kwargs)
    210 if len(st) == 0:
    211     if isinstance(pathname_or_url, Path):
...
--> 280     raise ObsPyMSEEDFilesizeTooLargeError(msg)
    282 info = util.get_record_information(mseed_object, endian=bo)
    284 # Map the encoding to a readable string value.

ObsPyMSEEDFilesizeTooLargeError: ObsPy can currently not directly read mini-SEED files that are larger than 2^31 bytes (2048 MiB). To still read it, please read the file in chunks as documented here: https://github.com/obspy/obspy/pull/1419#issuecomment-221582369

I have also tried the chunked reading approach mentioned in the error, but then it does not slice the data into exact 24-hour files. Can you help me figure out where the issue is?

import os
import io
import obspy
from obspy import UTCDateTime

# Path to your large MiniSEED file
file_path = "/home/TSIMP_Data_2022/D110_JAN_MAR.mseed"

# Base output directory
base_output_dir = "/home/TSIMP_Data_2022/D100/"

# Define the MiniSEED record length and chunk size (e.g., 50 MB)
RECORD_LENGTH = 512  # record length in bytes; the chunk size must be a multiple of the file's actual record length
CHUNK_SIZE_IN_MB = 50
CHUNK_SIZE = (CHUNK_SIZE_IN_MB * 1024 * 1024) // RECORD_LENGTH * RECORD_LENGTH  # round down to a multiple of the record length

# Keep track of processed time windows to avoid duplicates
processed_segments = set()

with io.open(file_path, "rb") as fh:
    while True:
        # Read a chunk from the file
        data = fh.read(CHUNK_SIZE)
        if not data:
            break  # End of file reached
        
        # Use BytesIO to read from the chunk
        with io.BytesIO(data) as buf:
            try:
                # Read the stream from buffer
                st = obspy.read(buf)
            except Exception as e:
                print(f"Error reading chunk: {e}")
                continue  # Skip this chunk on error
            
            # Process each trace in the stream
            for trace in st:
                # Extract trace metadata
                network = trace.stats.network
                station = trace.stats.station
                location = trace.stats.location or "--"
                channel = trace.stats.channel
                start_time = trace.stats.starttime
                end_time = trace.stats.endtime
                
                # Define station-specific output directory
                station_output_dir = os.path.join(base_output_dir, station)
                os.makedirs(station_output_dir, exist_ok=True)
                
                # Calculate the start of the first 24-hour period
                day_start = UTCDateTime(year=start_time.year, julday=start_time.julday)
                
                # Iterate over 24-hour periods within the trace time span
                current_start = day_start
                while current_start < end_time:
                    current_end = current_start + 86400  # 24 hours later
                    
                    # Determine the overlap between trace and current 24-hour period
                    slice_start = max(current_start, start_time)
                    slice_end = min(current_end, end_time)
                    
                    # Skip if no overlap
                    if slice_start >= slice_end:
                        current_start += 86400
                        continue
                    
                    # Create a unique identifier for this segment to avoid duplicates
                    segment_id = (
                        network, station, location, channel,
                        slice_start.strftime("%Y.%j"), slice_end.strftime("%Y.%j")
                    )
                    if segment_id in processed_segments:
                        current_start += 86400
                        continue  # Skip already processed segments
                    processed_segments.add(segment_id)
                    
                    # Slice the trace for current 24-hour period
                    trace_slice = trace.slice(starttime=slice_start, endtime=slice_end)
                    
                    # Construct filename
                    year = slice_start.year
                    julian_day = slice_start.julday
                    filename = f"{network}.{station}.{location}.{channel}.{year}.{julian_day:03d}.mseed"
                    output_path = os.path.join(station_output_dir, filename)
                    
                    # Write the sliced trace to file
                    try:
                        trace_slice.write(output_path, format='MSEED')
                        print(f"Saved: {output_path}")
                    except Exception as e:
                        print(f"Error saving {output_path}: {e}")
                    
                    # Move to the next 24-hour period
                    current_start += 86400

You could try to specify start and end times for the portions you want to read. Sounds like that might be what you want?

from obspy import read, UTCDateTime

for doy in range(1, 366):
    t1 = UTCDateTime(year=2022, julday=doy)
    t2 = t1 + 86400  # one day later (also handles the last day of the year)
    st = read(..., starttime=t1, endtime=t2)

see the obspy.io.mseed.core._read_mseed documentation (ObsPy master)
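
For example, here is a minimal sketch of how that suggestion could be combined with the daily file naming from your first script. The input path and output layout are taken from your code, and it assumes the selective read is accepted for a file of this size:

import os
from obspy import read, UTCDateTime

# Input file and output base directory from the original script
file_path = "/home/TSIMP_Data_2022/D110_JAN_MAR.mseed"
base_output_dir = "/home/TSIMP_Data_2022/"

# Iterate over every day of 2022
t1 = UTCDateTime(year=2022, julday=1)
year_end = UTCDateTime(year=2023, julday=1)

while t1 < year_end:
    t2 = t1 + 86400  # start of the next day

    try:
        # Read only the records between t1 and t2
        st = read(file_path, starttime=t1, endtime=t2)
    except Exception as e:
        print(f"Could not read {t1.date}: {e}")
        t1 = t2
        continue

    if len(st) > 0:
        # One sub-directory per day, as in the original script
        day_dir = os.path.join(base_output_dir, f"{t1.year}_{t1.julday:03d}")
        os.makedirs(day_dir, exist_ok=True)

        for tr in st:
            s = tr.stats
            filename = (f"{s.network}.{s.station}.{s.location or '--'}."
                        f"{s.channel}.{t1.year}.{t1.julday:03d}.mseed")
            tr.write(os.path.join(day_dir, filename), format="MSEED")
            print(f"Saved: {os.path.join(day_dir, filename)}")

    t1 = t2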


Thanks a lot, I'll try it out.