Javier del Villar
05/10/2023, 1:45 PM
I'm having an issue loading a folder of CSV files with SparkDataSet. More details in thread.
├── 01_raw
│ └── csv_folder
│ ├── file1.csv
│ ├── file2.csv
│ ├── file3.csv
│ ├── file4.csv
│ └── file5.csv
Catalog entry:
csv_data_local:
  type: spark.SparkDataSet
  filepath: data/01_raw/csv_folder
  file_format: csv
  load_args:
    header: True
    inferSchema: True
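(For reference, that catalog entry resolves to roughly the following Spark read. This is a sketch inferred from the file_format and load_args shown, not the exact SparkDataSet internals:)
scala> spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("data/01_raw/csv_folder")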
When I try to load that entry, I get the following error message:
DataSetError: Failed while loading data from data set SparkDataSet(file_format=csv,
filepath=C:/[REDACTED]/data/01_raw/csv_folder, load_args={'header': True, 'inferSchema': True}, save_args={}).
An error occurred while calling o52.load.
: java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
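(For context: an UnsatisfiedLinkError on NativeIO$Windows.access0 is the classic symptom of Hadoop's native Windows binaries, winutils.exe and hadoop.dll, being missing or not matching the Hadoop version Spark was built against. A quick check from the same spark-shell, as a sketch; it prints the bundled Hadoop version and whether HADOOP_HOME is visible to the JVM:)
scala> println(org.apache.hadoop.util.VersionInfo.getVersion)
scala> println(sys.env.getOrElse("HADOOP_HOME", "<HADOOP_HOME not set>"))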
Just to troubleshoot, I uploaded the folder to S3 and it worked perfectly:
Catalog entry:
csv_data_s3:
  type: spark.SparkDataSet
  credentials: dev_s3
  filepath: s3a://[REDACTED]/data/01_raw/csv_folder
  file_format: csv
  load_args:
    header: True
    inferSchema: True
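(That result fits the native-library theory: an s3a:// path is served by S3AFileSystem, so the directory listing never goes through RawLocalFileSystem and the Windows-native access0 call. The rough spark-shell equivalent of the S3 catalog entry, as a sketch; the fs.s3a.* keys stand in for whatever the dev_s3 credentials resolve to:)
scala> spark.sparkContext.hadoopConfiguration.set("fs.s3a.access.key", "<access-key>")
scala> spark.sparkContext.hadoopConfiguration.set("fs.s3a.secret.key", "<secret-key>")
scala> spark.read.option("header", "true").option("inferSchema", "true").csv("s3a://[REDACTED]/data/01_raw/csv_folder")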
I also tried loading a single file on my local PC, for example filepath: data/01_raw/csv_folder/file1.csv, and it loaded correctly.
I have managed to replicate this error on other PCs, including with other versions of Windows and Java.
Ian Whalen
05/10/2023, 1:53 PM
Javier del Villar
05/10/2023, 1:55 PM
Ian Whalen
05/10/2023, 1:58 PM
├── 01_raw
│ └── csv_folder
│ ├── file1.csv
│ ├── file2.csv
│ ├── file3.csv
│ ├── file4.csv
│ └── file5.csv
Javier del Villar
05/10/2023, 1:58 PM
Ian Whalen
05/10/2023, 1:59 PM
Do you get the same error with plain spark.read.csv?
Sajid Alam
05/10/2023, 2:08 PM
Javier del Villar
05/10/2023, 2:11 PM
scala> spark.read.csv("C:/[REDACTED]/data/01_raw/csv_folder")
java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1218)
at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1423)
at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
at org.apache.spark.util.HadoopFSUtils$.listLeafFiles(HadoopFSUtils.scala:225)
at org.apache.spark.util.HadoopFSUtils$.$anonfun$parallelListLeafFilesInternal$1(HadoopFSUtils.scala:95)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFilesInternal(HadoopFSUtils.scala:85)
at org.apache.spark.util.HadoopFSUtils$.parallelListLeafFiles(HadoopFSUtils.scala:69)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex$.bulkListLeafFiles(InMemoryFileIndex.scala:158)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.listLeafFiles(InMemoryFileIndex.scala:131)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.refresh0(InMemoryFileIndex.scala:94)
at org.apache.spark.sql.execution.datasources.InMemoryFileIndex.<init>(InMemoryFileIndex.scala:66)
at org.apache.spark.sql.execution.datasources.DataSource.createInMemoryFileIndex(DataSource.scala:567)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:409)
at org.apache.spark.sql.DataFrameReader.loadV1Source(DataFrameReader.scala:228)
at org.apache.spark.sql.DataFrameReader.$anonfun$load$2(DataFrameReader.scala:210)
at scala.Option.getOrElse(Option.scala:189)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:210)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:537)
at org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:443)
... 47 elided
scala> spark.read.csv("C:/[REDACTED]/data/01_raw/csv_folder/file1.csv")
23/05/10 11:10:18 WARN SizeEstimator: Failed to check whether UseCompressedOops is set; assuming yes
res4: org.apache.spark.sql.DataFrame = [_c0: string, _c1: string ... 135 more fields]
scala> spark.read.csv("C:\\[REDACTED]\\data\\01_raw\\csv_folder\\file*.csv")
Exception in thread "globPath-ForkJoinPool-10-worker-57" java.lang.UnsatisfiedLinkError: org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Ljava/lang/String;I)Z
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access0(Native Method)
at org.apache.hadoop.io.nativeio.NativeIO$Windows.access(NativeIO.java:793)
at org.apache.hadoop.fs.FileUtil.canRead(FileUtil.java:1218)
at org.apache.hadoop.fs.FileUtil.list(FileUtil.java:1423)
at org.apache.hadoop.fs.RawLocalFileSystem.listStatus(RawLocalFileSystem.java:601)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:1972)
at org.apache.hadoop.fs.FileSystem.listStatus(FileSystem.java:2014)
at org.apache.hadoop.fs.ChecksumFileSystem.listStatus(ChecksumFileSystem.java:761)
at org.apache.hadoop.fs.Globber.listStatus(Globber.java:128)
at org.apache.hadoop.fs.Globber.doGlob(Globber.java:291)
at org.apache.hadoop.fs.Globber.glob(Globber.java:202)
at org.apache.hadoop.fs.FileSystem.globStatus(FileSystem.java:2124)
at org.apache.spark.deploy.SparkHadoopUtil.globPath(SparkHadoopUtil.scala:253)
at org.apache.spark.sql.execution.datasources.DataSource$.$anonfun$checkAndGlobPathIfNecessary$3(DataSource.scala:765)
at org.apache.spark.util.ThreadUtils$.$anonfun$parmap$2(ThreadUtils.scala:372)
at scala.concurrent.Future$.$anonfun$apply$1(Future.scala:659)
at scala.util.Success.$anonfun$map$1(Try.scala:255)
at scala.util.Success.map(Try.scala:213)
at scala.concurrent.Future.$anonfun$map$1(Future.scala:292)
at scala.concurrent.impl.Promise.liftedTree1$1(Promise.scala:33)
at scala.concurrent.impl.Promise.$anonfun$transform$1(Promise.scala:33)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64)
at java.util.concurrent.ForkJoinTask$RunnableExecuteAction.exec(Unknown Source)
at java.util.concurrent.ForkJoinTask.doExec(Unknown Source)
at java.util.concurrent.ForkJoinPool$WorkQueue.runTask(Unknown Source)
at java.util.concurrent.ForkJoinPool.runWorker(Unknown Source)
at java.util.concurrent.ForkJoinWorkerThread.run(Unknown Source)
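(Both failing traces pass through RawLocalFileSystem.listStatus -> FileUtil.canRead -> NativeIO$Windows.access0, while the successful single-file read never enters that listing path; that is why the folder and the glob fail but file1.csv loads. On Windows, that call needs hadoop.dll plus winutils.exe from a Hadoop build matching the version Spark bundles. The usual fix is to install those binaries and point Hadoop at them before the JVM starts. A minimal sketch, assuming a hypothetical install at C:\hadoop with hadoop.dll and winutils.exe in C:\hadoop\bin and that bin directory on PATH:)
scala> // Hadoop checks the hadoop.home.dir system property before the
scala> // HADOOP_HOME environment variable; either must be in place before any
scala> // Hadoop class is first loaded, so prefer setting HADOOP_HOME outside the JVM.
scala> System.setProperty("hadoop.home.dir", "C:\\hadoop")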