# Standard Python libraries
import sys
import pandas as pd
import numpy as np
#Plotting
import matplotlib.pyplot as plt
# For reading segy data
from obspy.io.segy.segy import _read_segy
#For writing segy data
from obspy import read, Trace, Stream, UTCDateTime
from obspy.core import AttribDict
from obspy.io.segy.segy import SEGYTraceHeader, SEGYBinaryFileHeader
A. Read in text data
# Set the filename and read every line of the GSLIB text export into memory.
# A context manager guarantees the file handle is closed even if readlines()
# raises, replacing the manual open()/close() pair.
with open("userdata/P-impedance_2400-3000ms.gslib", 'r') as f:
    mylist = f.readlines()
print("Done reading file. Number of lines:", len(mylist))
Done reading file. Number of lines: 3957609
# Have a look at the top of the data: the header rows plus the first couple
# of data rows (11 lines total).
for line in mylist[0:11]:
    print(line)
PETREL: Properties
7
i_index unit1 scale1
j_index unit1 scale1
k_index unit1 scale1
x_coord unit1 scale1
y_coord unit1 scale1
z_coord unit1 scale1
P-Impedance unit1 scale1
127 1 300 438131.65314303 6475378.74871708 -2999.00000000 9556.294922
128 1 300 438181.65314303 6475378.74871708 -2999.00000000 9627.205078
# A few ways to map this to a useful python format. But the simplest may be
# the following: split each whitespace-delimited row into the seven GSLIB
# columns (i/j/k grid indices as ints, x/y/z coordinates and p impedance
# as floats).
data = []
# Skip the 9 header rows (title line, column count, then 7 column names)
for line in mylist[9:]:
    linesplit = line.split()
    i = int(linesplit[0])
    j = int(linesplit[1])
    k = int(linesplit[2])
    x = float(linesplit[3])
    y = float(linesplit[4])
    z = float(linesplit[5])
    p = float(linesplit[6])
    data.append([i, j, k, x, y, z, p])
# Put the list in a dataframe, naming the columns directly in the
# constructor instead of assigning df.columns afterwards.
df = pd.DataFrame(data, columns=['i', 'j', 'k', 'x', 'y', 'z', 'p'])
# Then free up some memory (because this is a fairly big chunk of data)
data = None
df
i | j | k | x | y | z | p | |
---|---|---|---|---|---|---|---|
0 | 127 | 1 | 300 | 438131.653143 | 6.475379e+06 | -2999.0 | 9556.294922 |
1 | 128 | 1 | 300 | 438181.653143 | 6.475379e+06 | -2999.0 | 9627.205078 |
2 | 129 | 1 | 300 | 438231.653143 | 6.475379e+06 | -2999.0 | 9555.066406 |
3 | 130 | 1 | 300 | 438281.653143 | 6.475379e+06 | -2999.0 | 9468.100586 |
4 | 123 | 2 | 300 | 437931.653143 | 6.475429e+06 | -2999.0 | 9601.517578 |
... | ... | ... | ... | ... | ... | ... | ... |
3957595 | 26 | 127 | 1 | 433081.653143 | 6.481679e+06 | -2401.0 | 5848.801758 |
3957596 | 27 | 127 | 1 | 433131.653143 | 6.481679e+06 | -2401.0 | 5924.203125 |
3957597 | 28 | 127 | 1 | 433181.653143 | 6.481679e+06 | -2401.0 | 6037.129883 |
3957598 | 29 | 127 | 1 | 433231.653143 | 6.481679e+06 | -2401.0 | 5978.708984 |
3957599 | 25 | 128 | 1 | 433031.653143 | 6.481729e+06 | -2401.0 | 5812.972168 |
3957600 rows × 7 columns
# Plot a single trace to see everything looks okay: the impedance values at
# one i-j location across all depths.
one_trace = df[(df.i == 127) & (df.j == 1)].p
plt.figure(figsize=(16, 2))
plt.plot(one_trace)
plt.show()
# Quick sanity plot of the z column as stored in the dataframe
plt.plot(df.z)
B. Read in equivalent segy data.
Get the segy header information from here, and just see what the goal is for the conversion.
# Read the equivalent SEG-Y data (headers only) to see what the goal is for
# the conversion.
stream = None  # flush any previous value of this demo variable
stream = _read_segy("userdata/P-impedance_2400-3000ms.sgy", headonly=True)
print(np.shape(stream.traces))
stream
(216540,)
216540 traces in the SEG Y structure.
# Flatten every SEG-Y trace into rows of (i, j, k, x, y, z, p) so the data
# can be compared with (and written like) the text format.
arrs = []
for index, one_trace in enumerate(stream.traces):
    p = one_trace.data
    lenp = len(p)
    # Broadcast the per-trace header values down the length of the trace
    i = np.full((lenp), index, dtype=int)
    j = np.full((lenp), one_trace.header.ensemble_number, dtype=int)
    k = np.arange(lenp, 0, -1, dtype=int)  # sample counter, descending
    x = np.full((lenp), one_trace.header.source_coordinate_x)
    y = np.full((lenp), one_trace.header.source_coordinate_y)
    z = np.full((lenp), one_trace.header.original_field_record_number)

    temparr = np.c_[i, j, k, x, y, z, p]
    arrs.append(temparr)

# Stack the per-trace arrays into one (total_samples, 7) array
arrs = np.vstack(arrs)
arrs[-1]
array([2.16539000e+05, 2.50000000e+03, 1.00000000e+00, 4.32975000e+05,
6.48179200e+06, 1.03610000e+04, 9.38641016e+03])
# Write the flattened array back out in GSLIB-style text format.
# NOTE(review): the "\n\n" after the 7 produces a blank header line, giving
# 10 header lines where the input file had 9 — confirm whatever reads
# data.txt expects this before relying on it.
header = "PETREL: Properties\n7\n\ni_index unit1 scale1\nj_index unit1 scale1\nk_index unit1 scale1\nx_coord unit1 scale1\ny_coord unit1 scale1\nz_coord unit1 scale1\nP-Impedance unit1 scale1\n"
np.savetxt("data.txt", arrs, header=header, comments='', fmt="%i %i %i %i %i %i %f")
one_trace = stream.traces[100]

# Print out details of a single trace
print(one_trace)

# Plot a single trace to see everything looks okay
plt.figure(figsize=(16, 2))
plt.plot(one_trace.data)
plt.show()
Trace sequence number within line: 0
151 samples, dtype=float32, 250.00 Hz
# Here is the header information - for one trace
stream.traces[0].header
trace_sequence_number_within_line: 0
trace_sequence_number_within_segy_file: 0
original_field_record_number: 9961
trace_number_within_the_original_field_record: 0
energy_source_point_number: 0
ensemble_number: 1961
trace_number_within_the_ensemble: 0
trace_identification_code: 0
number_of_vertically_summed_traces_yielding_this_trace: 0
number_of_horizontally_stacked_traces_yielding_this_trace: 0
data_use: 0
distance_from_center_of_the_source_point_to_the_center_of_the_receiver_group: 0
receiver_group_elevation: 0
surface_elevation_at_source: 0
source_depth_below_surface: 0
datum_elevation_at_receiver_group: 0
datum_elevation_at_source: 0
water_depth_at_source: 0
water_depth_at_group: 0
scalar_to_be_applied_to_all_elevations_and_depths: 0
scalar_to_be_applied_to_all_coordinates: 0
source_coordinate_x: 438302
source_coordinate_y: 6475310
group_coordinate_x: 0
group_coordinate_y: 0
coordinate_units: 0
weathering_velocity: 0
subweathering_velocity: 0
uphole_time_at_source_in_ms: 0
uphole_time_at_group_in_ms: 0
source_static_correction_in_ms: 0
group_static_correction_in_ms: 0
total_static_applied_in_ms: 0
lag_time_A: 2400
lag_time_B: 3000
delay_recording_time: 0
mute_time_start_time_in_ms: 0
mute_time_end_time_in_ms: 0
number_of_samples_in_this_trace: 151
sample_interval_in_ms_for_this_trace: 4000
gain_type_of_field_instruments: 0
instrument_gain_constant: 0
instrument_early_or_initial_gain: 0
correlated: 0
sweep_frequency_at_start: 0
sweep_frequency_at_end: 0
sweep_length_in_ms: 0
sweep_type: 0
sweep_trace_taper_length_at_start_in_ms: 0
sweep_trace_taper_length_at_end_in_ms: 0
taper_type: 0
alias_filter_frequency: 0
alias_filter_slope: 0
notch_filter_frequency: 0
notch_filter_slope: 0
low_cut_frequency: 0
high_cut_frequency: 0
low_cut_slope: 0
high_cut_slope: 0
year_data_recorded: 0
day_of_year: 0
hour_of_day: 0
minute_of_hour: 0
second_of_minute: 0
time_basis_code: 0
trace_weighting_factor: 0
geophone_group_number_of_roll_switch_position_one: 0
geophone_group_number_of_trace_number_one: 0
geophone_group_number_of_last_trace: 0
gap_size: 0
over_travel_associated_with_taper: 0
x_coordinate_of_ensemble_position_of_this_trace: 0
y_coordinate_of_ensemble_position_of_this_trace: 0
for_3d_poststack_data_this_field_is_for_in_line_number: 0
for_3d_poststack_data_this_field_is_for_cross_line_number: 0
shotpoint_number: 0
scalar_to_be_applied_to_the_shotpoint_number: 0
trace_value_measurement_unit: 0
transduction_constant_mantissa: 0
transduction_constant_exponent: 0
transduction_units: 0
device_trace_identifier: 0
scalar_to_be_applied_to_times: 0
source_type_orientation: 0
source_energy_direction_mantissa: 0
source_energy_direction_exponent: 0
source_measurement_mantissa: 0
source_measurement_exponent: 0
source_measurement_unit: 0
C. Write out the text data as segy
# Group all the text traces by their i-j coordinates: each group is one
# trace-worth of samples.
groups = df.groupby(['i', 'j'])
print(len(groups))
# Here I notice there are only 13192 traces in the text data, compared with
# the 216540 in the segy data...
13192
# %%time  (Jupyter cell magic — timed this cell when run in the notebook)

# Make a stream object (flush it out to begin, because we have used this
# variable name for demos)
stream_out = None
stream_out = Stream()

# Not sure how to group the trace ensembles, but can use a counter to keep
# track of them
ensemble_number = 0

for ids, df_trace in groups:
    # ids are the i, j coordinate locations
    # df_trace is the subset of the full dataframe for just that i-j location
    # For each i-j location, a trace is the impedance at all the depth values
    data = df_trace.p.values

    # Enforce correct byte number and set to the Trace object
    data = np.require(data, dtype=np.float32)
    trace = Trace(data=data)

    # Set all the segy header information.
    # Attributes in trace.stats will overwrite everything in
    # trace.stats.segy.trace_header
    trace.stats.delta = 0.01
    trace.stats.starttime = UTCDateTime(1970, 1, 1, 0, 0, 0)

    # If you want to set some additional attributes in the trace header,
    # add one and only set the attributes you want to be set. Otherwise the
    # header will be created for you with default values.
    # NOTE(review): hasattr with a dotted name is always False, so this branch
    # always runs — same as the obspy docs example, but worth confirming.
    if not hasattr(trace.stats, 'segy.trace_header'):
        trace.stats.segy = {}
    trace.stats.segy.trace_header = SEGYTraceHeader()
    # trace.stats.segy.trace_header.trace_sequence_number_within_line = index + 1
    # trace.stats.segy.trace_header.receiver_group_elevation = 0
    trace.stats.segy.trace_header.source_coordinate_x = int(df_trace.x.values[0])
    trace.stats.segy.trace_header.source_coordinate_y = int(df_trace.y.values[0])
    trace.stats.segy.trace_header.ensemble_number = ensemble_number  # Not sure how this is actually determined
    trace.stats.segy.trace_header.lag_time_A = 2400
    trace.stats.segy.trace_header.lag_time_B = 3000
    trace.stats.segy.trace_header.number_of_samples_in_this_trace = len(data)

    ensemble_number += 1

    # Add trace to stream
    stream_out.append(trace)

# A SEGY file has file-wide headers. These can be attached to the stream
# object. If they are not set, they will be autocreated with default values.
stream_out.stats = AttribDict()
stream_out.stats.textual_file_header = 'Textual Header!'
stream_out.stats.binary_file_header = SEGYBinaryFileHeader()
stream_out.stats.binary_file_header.trace_sorting_code = 5
# stream.stats.binary_file_header.number_of_data_traces_per_ensemble=1

print("Stream object before writing...")
print(stream_out)

stream_out.write("TEST.sgy", format="SEGY", data_encoding=1, byteorder=sys.byteorder)

print("Stream object after writing. Will have some segy attributes...")
print(stream_out)
print("Stream object after writing. Will have some segy attributes...")
print(stream_out)
Stream object before writing...
13192 Trace(s) in Stream:
... | 1970-01-01T00:00:00.000000Z - 1970-01-01T00:00:02.990000Z | 100.0 Hz, 300 samples
...
(13190 other traces)
...
... | 1970-01-01T00:00:00.000000Z - 1970-01-01T00:00:02.990000Z | 100.0 Hz, 300 samples
[Use "print(Stream.__str__(extended=True))" to print all Traces]
Stream object after writing. Will have some segy attributes...
13192 Trace(s) in Stream:
Seq. No. in line: 0 | 1970-01-01T00:00:00.000000Z - 1970-01-01T00:00:02.990000Z | 100.0 Hz, 300 samples
...
(13190 other traces)
...
Seq. No. in line: 0 | 1970-01-01T00:00:00.000000Z - 1970-01-01T00:00:02.990000Z | 100.0 Hz, 300 samples
[Use "print(Stream.__str__(extended=True))" to print all Traces]
CPU times: total: 7.55 s
Wall time: 7.52 s
# Now check it: re-read the file we just wrote and confirm the trace count.
print("Reading using obspy.io.segy...")
st1 = _read_segy("TEST.sgy")
print(st1)
print(np.shape(st1.traces))
st1
Reading using obspy.io.segy...
13192 traces in the SEG Y structure.
(13192,)
13192 traces in the SEG Y structure.
one_trace = st1.traces[0]

# Print out details of a single trace
print(one_trace)

# Plot a single trace
plt.figure(figsize=(16, 2))
plt.plot(one_trace.data)
plt.show()
Trace sequence number within line: 0
300 samples, dtype=float32, 100.00 Hz
Bonus - why is there a data discrepancy?
I would have expected the traces from the segy data and from the gslib data to match up. But there are only 13192 traces in the text data and 216540 in the segy data. Something weird is going on. I can plot these side by side and see if some data are missing.
# Collect the source x/y coordinates of every SEG-Y trace and overlay them
# with the text-data coordinates to compare coverage. Iterating the traces
# directly replaces the index-based loop.
xx = [tr.header.source_coordinate_x for tr in stream.traces]
yy = [tr.header.source_coordinate_y for tr in stream.traces]
plt.scatter(xx, yy)
plt.scatter(df.x, df.y)
<matplotlib.collections.PathCollection at 0x2d03be3b550>
At first glance this seems okay: the data coverage is over the same area and only the edges are missing. But zooming in, we can see the text data is not as dense as the segy data. Problem solved.
# Same overlay with small markers, zoomed in on x to compare point density.
plt.scatter(xx, yy, s=0.1)
plt.scatter(df.x, df.y, s=0.1)
plt.xlim([433000, 434000])
# plt.ylim([433000,434000])
(433000.0, 434000.0)
All materials copyright Sydney Informatics Hub, University of Sydney